diff --git a/pyproject.toml b/pyproject.toml
index ee3389f..fbd1715 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,4 +6,30 @@ requires = [
     "datasets",
     "seqeval"
 ]
-build-backend = "setuptools.build_meta"
\ No newline at end of file
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 120
+target-version = ['py36', 'py37', 'py38']
+exclude = '''
+(
+  /(
+    \.eggs
+    | \.git
+    | \.pytest_cache
+    | build
+    | dist
+  )/
+)
+'''
+[tool.pytest.ini_options]
+flake8-max-line-length = 210
+flake8-ignore = ["E203", "W503"] # See https://github.com/PyCQA/pycodestyle/issues/373
+markers = [
+    "integration",
+]
+[tool.mypy]
+ignore_missing_imports = true
+
+[tool.isort]
+profile = "black"
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 37af714..72c198b 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,2 +1,10 @@
-transformers
-datasets
\ No newline at end of file
+pytest
+pytest-mypy
+pytest-isort
+pytest-flake8
+flake8-black
+flake8<4.0.0
+types-Deprecated
+types-dataclasses
+types-tabulate
+types-requests
\ No newline at end of file
diff --git a/src/dlkp/kp_dataset/datasets.py b/src/dlkp/kp_dataset/datasets.py
index ba14d28..97721b0 100644
--- a/src/dlkp/kp_dataset/datasets.py
+++ b/src/dlkp/kp_dataset/datasets.py
@@ -1,6 +1,8 @@
-import os, sys
+import os
+import sys
 from dataclasses import dataclass, field
 from typing import Optional
+
 from datasets import ClassLabel, load_dataset


@@ -38,9 +40,7 @@ def set_labels(self):
     def load_kp_datasets(self):
         if self.data_args.dataset_name is not None:
             # Downloading and loading a dataset from the hub.
-            self.datasets = load_dataset(
-                self.data_args.dataset_name, self.data_args.dataset_config_name
-            )
+            self.datasets = load_dataset(self.data_args.dataset_name, self.data_args.dataset_config_name)
         else:
             data_files = {}
             if self.data_args.train_file is not None:
@@ -63,9 +63,7 @@ def load_kp_datasets(self):
             column_names = self.datasets["test"].column_names
             features = self.datasets["test"].features
         else:
-            raise AssertionError(
-                "neither train, validation nor test dataset is availabel"
-            )
+            raise AssertionError("neither train, validation nor test dataset is availabel")

         if self.text_column_name is None:
             self.text_column_name = (
@@ -75,9 +73,7 @@ def load_kp_datasets(self):
         assert self.text_column_name in column_names

         if self.label_column_name is None:
-            self.label_column_name = (
-                "doc_bio_tags" if "doc_bio_tags" in column_names else None
-            )
+            self.label_column_name = "doc_bio_tags" if "doc_bio_tags" in column_names else None
             if len(column_names) > 2:
                 self.label_column_name = column_names[2]

@@ -132,11 +128,7 @@ def tokenize_and_align_labels_(self, examples):
                 # For the other tokens in a word, we set the label to either the current label or -100, depending on
                 # the label_all_tokens flag.
                 else:
-                    label_ids.append(
-                        self.label_to_id[label[word_idx]]
-                        if self.data_args.label_all_tokens
-                        else -100
-                    )
+                    label_ids.append(self.label_to_id[label[word_idx]] if self.data_args.label_all_tokens else -100)
                 # to avoid error change -100 to 'O' tag i.e.
2 class # label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else 2) previous_word_idx = word_idx @@ -165,11 +157,7 @@ def extract_kp_from_tags(self, examples, idx): ids = examples["input_ids"] atn_mask = examples["special_tokens_mask"] tokens = self.tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True) - tags = [ - self.id_to_label[p] - for (p, m) in zip(self.predicted_labels[idx], atn_mask) - if m == 0 - ] + tags = [self.id_to_label[p] for (p, m) in zip(self.predicted_labels[idx], atn_mask) if m == 0] assert len(tokens) == len( tags ), "number of tags (={}) in prediction and tokens(={}) are not same for {}th".format( diff --git a/src/dlkp/kp_metrics/metrics.py b/src/dlkp/kp_metrics/metrics.py index 5901ec4..cf3a7c3 100644 --- a/src/dlkp/kp_metrics/metrics.py +++ b/src/dlkp/kp_metrics/metrics.py @@ -1,6 +1,6 @@ -from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score -from seqeval.scheme import IOB2, IOB1 import numpy as np +from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score +from seqeval.scheme import IOB1, IOB2 def compute_metrics(p): @@ -27,9 +27,7 @@ def compute_metrics(p): results = {} # print("cal precisi") # mode="strict" - results["overall_precision"] = precision_score( - true_labels, true_predictions, scheme=IOB2 - ) + results["overall_precision"] = precision_score(true_labels, true_predictions, scheme=IOB2) results["overall_recall"] = recall_score(true_labels, true_predictions, scheme=IOB2) # print("cal f1") results["overall_f1"] = f1_score(true_labels, true_predictions, scheme=IOB2) diff --git a/src/dlkp/models/ke/crf/crf.py b/src/dlkp/models/ke/crf/crf.py index 27786b5..c36f5e6 100644 --- a/src/dlkp/models/ke/crf/crf.py +++ b/src/dlkp/models/ke/crf/crf.py @@ -1,10 +1,11 @@ # add models having crf classification layer with option of bilstm layers -from .crf_utils import * -from typing import List, Tuple, Dict, Union +from typing import Dict, List, Tuple, Union import torch +from .crf_utils import * + VITERBI_DECODING = Tuple[List[int], float] @@ -64,9 +65,7 @@ def reset_parameters(self): torch.nn.init.normal_(self.start_transitions) torch.nn.init.normal_(self.end_transitions) - def _input_likelihood( - self, logits: torch.Tensor, mask: torch.BoolTensor - ) -> torch.Tensor: + def _input_likelihood(self, logits: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: """ Computes the (batch_size,) denominator term for the log-likelihood, which is the sum of the likelihoods across all possible state sequences. @@ -100,9 +99,7 @@ def _input_likelihood( # In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension # of `inner`. Otherwise (mask == False) we want to retain the previous alpha. - alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * ( - ~mask[i] - ).view(batch_size, 1) + alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * (~mask[i]).view(batch_size, 1) # Every sequence needs to end with a transition to the stop_tag. 
if self.include_start_end_transitions: @@ -113,9 +110,7 @@ def _input_likelihood( # Finally we log_sum_exp along the num_tags dim, result is (batch_size,) return logsumexp(stops) - def _joint_likelihood( - self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor - ) -> torch.Tensor: + def _joint_likelihood(self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: """ Computes the numerator term for the log-likelihood, which is just score(inputs, tags) """ @@ -163,18 +158,14 @@ def _joint_likelihood( # Add the last input if it's not masked. last_inputs = logits[-1] # (batch_size, num_tags) - last_input_score = last_inputs.gather( - 1, last_tags.view(-1, 1) - ) # (batch_size, 1) + last_input_score = last_inputs.gather(1, last_tags.view(-1, 1)) # (batch_size, 1) last_input_score = last_input_score.squeeze() # (batch_size,) score = score + last_transition_score + last_input_score * mask[-1] return score - def forward( - self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None - ) -> torch.Tensor: + def forward(self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None) -> torch.Tensor: """ Computes the log likelihood. """ @@ -226,33 +217,21 @@ def viterbi_tags( transitions = torch.Tensor(num_tags + 2, num_tags + 2).fill_(-10000.0) # Apply transition constraints - constrained_transitions = self.transitions * self._constraint_mask[ - :num_tags, :num_tags - ] + -10000.0 * (1 - self._constraint_mask[:num_tags, :num_tags]) + constrained_transitions = self.transitions * self._constraint_mask[:num_tags, :num_tags] + -10000.0 * ( + 1 - self._constraint_mask[:num_tags, :num_tags] + ) transitions[:num_tags, :num_tags] = constrained_transitions.data if self.include_start_end_transitions: - transitions[ + transitions[start_tag, :num_tags] = self.start_transitions.detach() * self._constraint_mask[ start_tag, :num_tags - ] = self.start_transitions.detach() * self._constraint_mask[ - start_tag, :num_tags - ].data + -10000.0 * ( - 1 - self._constraint_mask[start_tag, :num_tags].detach() - ) - transitions[ + ].data + -10000.0 * (1 - self._constraint_mask[start_tag, :num_tags].detach()) + transitions[:num_tags, end_tag] = self.end_transitions.detach() * self._constraint_mask[ :num_tags, end_tag - ] = self.end_transitions.detach() * self._constraint_mask[ - :num_tags, end_tag - ].data + -10000.0 * ( - 1 - self._constraint_mask[:num_tags, end_tag].detach() - ) + ].data + -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach()) else: - transitions[start_tag, :num_tags] = -10000.0 * ( - 1 - self._constraint_mask[start_tag, :num_tags].detach() - ) - transitions[:num_tags, end_tag] = -10000.0 * ( - 1 - self._constraint_mask[:num_tags, end_tag].detach() - ) + transitions[start_tag, :num_tags] = -10000.0 * (1 - self._constraint_mask[start_tag, :num_tags].detach()) + transitions[:num_tags, end_tag] = -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach()) best_paths = [] # Pad the max sequence length by 2 to account for start_tag + end_tag. 
diff --git a/src/dlkp/models/ke/crf/crf_trainer.py b/src/dlkp/models/ke/crf/crf_trainer.py index 073f29e..912249f 100644 --- a/src/dlkp/models/ke/crf/crf_trainer.py +++ b/src/dlkp/models/ke/crf/crf_trainer.py @@ -1,11 +1,9 @@ -from transformers import ( - Trainer, - set_seed, -) -from transformers.trainer import * -from transformers.trainer_utils import PredictionOutput from torch import nn from torch.utils.data.dataloader import DataLoader +from transformers import Trainer, set_seed +from transformers.trainer import * +from transformers.trainer_utils import PredictionOutput + # from torch.utils.data.dataset import Dataset # from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -26,17 +24,13 @@ def prediction_loop( if not isinstance(dataloader.dataset, collections.abc.Sized): raise ValueError("dataset must implement __len__") prediction_loss_only = ( - prediction_loss_only - if prediction_loss_only is not None - else self.args.prediction_loss_only + prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only ) if self.args.deepspeed and not self.args.do_train: # no harm, but flagging to the user that deepspeed config is ignored for eval # flagging only for when --do_train wasn't passed as only then it's redundant - logger.info( - "Detected the deepspeed argument but it will not be used for evaluation" - ) + logger.info("Detected the deepspeed argument but it will not be used for evaluation") model = self._wrap_model(self.model, training=False) @@ -56,39 +50,27 @@ def prediction_loop( world_size = max(1, self.args.world_size) - eval_losses_gatherer = DistributedTensorGatherer( - world_size, num_examples, make_multiple_of=batch_size - ) + eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) if not prediction_loss_only: # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass # a batch size to the sampler) make_multiple_of = None - if hasattr(dataloader, "sampler") and isinstance( - dataloader.sampler, SequentialDistributedSampler - ): + if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): make_multiple_of = dataloader.sampler.batch_size - preds_gatherer = DistributedTensorGatherer( - world_size, num_examples, make_multiple_of=make_multiple_of - ) - labels_gatherer = DistributedTensorGatherer( - world_size, num_examples, make_multiple_of=make_multiple_of - ) + preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) if self.args.past_index >= 0: self._past = None model.eval() if is_torch_tpu_available(): - dataloader = pl.ParallelLoader( - dataloader, [self.args.device] - ).per_device_loader(self.args.device) + dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) self.callback_handler.eval_dataloader = dataloader for step, inputs in enumerate(dataloader): - loss, logits, labels = self.prediction_step( - model, inputs, prediction_loss_only, ignore_keys=ignore_keys - ) + loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) best_path = self.eval_step(model, logits, inputs["attention_mask"]) # best_path= self.eval_step(model, logits) @@ -112,42 +94,19 @@ def prediction_loop( # assert logits.shape==labels.shape if loss is not None: losses = 
loss.repeat(batch_size) - losses_host = ( - losses - if losses_host is None - else torch.cat((losses_host, losses), dim=0) - ) + losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) if logits is not None: - preds_host = ( - logits - if preds_host is None - else nested_concat(preds_host, logits, padding_index=-100) - ) + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) if labels is not None: - labels_host = ( - labels - if labels_host is None - else nested_concat(labels_host, labels, padding_index=-100) - ) - self.control = self.callback_handler.on_prediction_step( - self.args, self.state, self.control - ) + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control) # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. - if ( - self.args.eval_accumulation_steps is not None - and (step + 1) % self.args.eval_accumulation_steps == 0 - ): - eval_losses_gatherer.add_arrays( - self._gather_and_numpify(losses_host, "eval_losses") - ) + if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0: + eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) if not prediction_loss_only: - preds_gatherer.add_arrays( - self._gather_and_numpify(preds_host, "eval_preds") - ) - labels_gatherer.add_arrays( - self._gather_and_numpify(labels_host, "eval_label_ids") - ) + preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) + labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) # Set back to None to begin a new accumulation losses_host, preds_host, labels_host = None, None, None @@ -157,29 +116,17 @@ def prediction_loop( delattr(self, "_past") # Gather all remaining tensors and put them back on the CPU - eval_losses_gatherer.add_arrays( - self._gather_and_numpify(losses_host, "eval_losses") - ) + eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) if not prediction_loss_only: - preds_gatherer.add_arrays( - self._gather_and_numpify(preds_host, "eval_preds") - ) - labels_gatherer.add_arrays( - self._gather_and_numpify(labels_host, "eval_label_ids") - ) + preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) + labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) eval_loss = eval_losses_gatherer.finalize() preds = preds_gatherer.finalize() if not prediction_loss_only else None label_ids = labels_gatherer.finalize() if not prediction_loss_only else None - if ( - self.compute_metrics is not None - and preds is not None - and label_ids is not None - ): - metrics = self.compute_metrics( - EvalPrediction(predictions=preds, label_ids=label_ids) - ) + if self.compute_metrics is not None and preds is not None and label_ids is not None: + metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) else: metrics = {} diff --git a/src/dlkp/models/ke/crf/crf_utils.py b/src/dlkp/models/ke/crf/crf_utils.py index e9e3818..f83c335 100644 --- a/src/dlkp/models/ke/crf/crf_utils.py +++ b/src/dlkp/models/ke/crf/crf_utils.py @@ -1,11 +1,11 @@ """ Conditional random field utilis file """ -from typing import List, Tuple, Dict, Union, Optional +import logging +import math +from typing import Dict, List, Optional, Tuple, Union import 
torch -import math -import logging # from allennlp.common.checks import ConfigurationError # import allennlp.nn.util as util @@ -15,9 +15,7 @@ # logger = logging.get_logger(__name__) -def allowed_transitions( - constraint_type: str, labels: Dict[int, str] -) -> List[Tuple[int, int]]: +def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tuple[int, int]]: """ Given labels and a constraint type, returns the allowed transitions. It will additionally include transitions for the start and end states, which are used @@ -56,16 +54,12 @@ def allowed_transitions( else: to_tag = to_label[0] to_entity = to_label[1:] - if is_transition_allowed( - constraint_type, from_tag, from_entity, to_tag, to_entity - ): + if is_transition_allowed(constraint_type, from_tag, from_entity, to_tag, to_entity): allowed.append((from_label_index, to_label_index)) return allowed -def is_transition_allowed( - constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str -): +def is_transition_allowed(constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str): """ Given a constraint type and strings `from_tag` and `to_tag` that represent the origin and destination of the transition, return whether @@ -108,9 +102,7 @@ def is_transition_allowed( from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"), # B-x can only transition to I-x or L-x # I-x can only transition to I-x or L-x - from_tag in ("B", "I") - and to_tag in ("I", "L") - and from_entity == to_entity, + from_tag in ("B", "I") and to_tag in ("I", "L") and from_entity == to_entity, ] ) elif constraint_type == "BIO": @@ -161,9 +153,7 @@ def is_transition_allowed( print("error in constrint type") -def logsumexp( - tensor: torch.Tensor, dim: int = -1, keepdim: bool = False -) -> torch.Tensor: +def logsumexp(tensor: torch.Tensor, dim: int = -1, keepdim: bool = False) -> torch.Tensor: """ A numerically stable computation of logsumexp. This is mathematically equivalent to `tensor.exp().sum(dim, keep=keepdim).log()`. This function is typically used for summing log @@ -236,15 +226,11 @@ def viterbi_decode( elif top_k >= 1: flatten_output = False else: - raise ValueError( - f"top_k must be either None or an integer >=1. Instead received {top_k}" - ) + raise ValueError(f"top_k must be either None or an integer >=1. Instead received {top_k}") sequence_length, num_tags = list(tag_sequence.size()) - has_start_end_restrictions = ( - allowed_end_transitions is not None or allowed_start_transitions is not None - ) + has_start_end_restrictions = allowed_end_transitions is not None or allowed_start_transitions is not None if has_start_end_restrictions: @@ -259,12 +245,8 @@ def viterbi_decode( # Start and end transitions are fully defined, but cannot transition between each other. - allowed_start_transitions = torch.cat( - [allowed_start_transitions, torch.tensor([-math.inf, -math.inf])] - ) - allowed_end_transitions = torch.cat( - [allowed_end_transitions, torch.tensor([-math.inf, -math.inf])] - ) + allowed_start_transitions = torch.cat([allowed_start_transitions, torch.tensor([-math.inf, -math.inf])]) + allowed_end_transitions = torch.cat([allowed_end_transitions, torch.tensor([-math.inf, -math.inf])]) # First define how we may transition FROM the start and end tags. 
new_transition_matrix[-2, :] = allowed_start_transitions diff --git a/src/dlkp/models/ke/extract_kp_text.py b/src/dlkp/models/ke/extract_kp_text.py index ba1e3b6..4c05ad4 100644 --- a/src/dlkp/models/ke/extract_kp_text.py +++ b/src/dlkp/models/ke/extract_kp_text.py @@ -1,11 +1,11 @@ -import os, json -from .extraction_utils import TrainingArguments, DataTrainingArguments, ModelArguments +import json +import os + +from .extraction_utils import DataTrainingArguments, ModelArguments, TrainingArguments from .kpe import run_extraction_model -def extract_from_text( - text_list, model_name_or_path, use_CRF=False, output_dir="eval_output" -): +def extract_from_text(text_list, model_name_or_path, use_CRF=False, output_dir="eval_output"): # if output_dir is None: # output_dir = # create a file and pass to extractor diff --git a/src/dlkp/models/ke/extraction_utils.py b/src/dlkp/models/ke/extraction_utils.py index 0ba2ebb..eeb0343 100644 --- a/src/dlkp/models/ke/extraction_utils.py +++ b/src/dlkp/models/ke/extraction_utils.py @@ -3,6 +3,7 @@ import sys from dataclasses import dataclass, field from typing import Optional + from transformers import TrainingArguments @@ -13,39 +14,27 @@ class ModelArguments: """ model_name_or_path: str = field( - metadata={ - "help": "Path to pretrained model or model identifier from huggingface.co/models" - } + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} ) model_family_name: str = field( default="auto", - metadata={ - "help": "name of the family of model, bert, longformer, reformer etc." - }, + metadata={"help": "name of the family of model, bert, longformer, reformer etc."}, ) config_name: Optional[str] = field( default=None, - metadata={ - "help": "Pretrained config name or path if not the same as model_name" - }, + metadata={"help": "Pretrained config name or path if not the same as model_name"}, ) tokenizer_name: Optional[str] = field( default=None, - metadata={ - "help": "Pretrained tokenizer name or path if not the same as model_name" - }, + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, ) cache_dir: Optional[str] = field( default=None, - metadata={ - "help": "Where do you want to store the pretrained models downloaded from huggingface.co" - }, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) model_revision: str = field( default="main", - metadata={ - "help": "The specific model version to use (can be a branch name, tag name or commit id)." - }, + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) use_CRF: bool = field( default=False, @@ -63,9 +52,7 @@ class DataTrainingArguments: Arguments pertaining to what data we are going to input our model for training and eval. """ - task_name: Optional[str] = field( - default="token", metadata={"help": "The name of the task token, crf"} - ) + task_name: Optional[str] = field(default="token", metadata={"help": "The name of the task token, crf"}) train_file: Optional[str] = field( default=None, @@ -73,28 +60,20 @@ class DataTrainingArguments: ) validation_file: Optional[str] = field( default=None, - metadata={ - "help": "An optional input evaluation data file to evaluate on (a csv or JSON file)." 
- }, + metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."}, ) test_file: Optional[str] = field( default=None, - metadata={ - "help": "An optional input test data file to predict on (a csv or JSON file)." - }, + metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, ) text_column_name: Optional[str] = field( default=None, - metadata={ - "help": "An optional input test data file to predict on (a csv or JSON file)." - }, + metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, ) label_column_name: Optional[str] = field( default=None, - metadata={ - "help": "An optional input test data file to predict on (a csv or JSON file)." - }, + metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, ) train_data_percent: Optional[int] = field( default=100, @@ -134,9 +113,7 @@ class DataTrainingArguments: ) return_entity_level_metrics: bool = field( default=False, - metadata={ - "help": "Whether to return all the entity levels during evaluation or just the overall ones." - }, + metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."}, ) dataset_name: Optional[str] = field( default=None, @@ -144,9 +121,7 @@ class DataTrainingArguments: ) dataset_config_name: Optional[str] = field( default="extraction", - metadata={ - "help": "The configuration name of the dataset to use (via the datasets library)." - }, + metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}, ) cache_file_name: Optional[str] = field( default=None, @@ -162,9 +137,7 @@ def __post_init__(self): and self.validation_file is None and self.test_file is None ): - raise ValueError( - "Need either a dataset name or a training/validation file." - ) + raise ValueError("Need either a dataset name or a training/validation file.") else: if self.train_file is not None: extension = self.train_file.split(".")[-1] @@ -185,15 +158,10 @@ def __post_init__(self): "json", ], "`test_file` should be a csv or a json file." self.task_name = self.task_name.lower() - assert ( - self.train_data_percent + self.test_data_percent + self.valid_data_percent - == 100 - ) + assert self.train_data_percent + self.test_data_percent + self.valid_data_percent == 100 -def tokenize_and_align_labels( - examples, tokenizer, text_column_name, padding, label_column_name=None -): +def tokenize_and_align_labels(examples, tokenizer, text_column_name, padding, label_column_name=None): tokenized_inputs = tokenizer( examples[text_column_name], padding=padding, @@ -218,9 +186,7 @@ def tokenize_and_align_labels( # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: - label_ids.append( - label_to_id[label[word_idx]] if data_args.label_all_tokens else -100 - ) + label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100) # to avoid error change -100 to 'O' tag i.e. 
2 class # label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else 2) previous_word_idx = word_idx diff --git a/src/dlkp/models/ke/kpe.py b/src/dlkp/models/ke/kpe.py index 38b1d3c..75cc7d1 100644 --- a/src/dlkp/models/ke/kpe.py +++ b/src/dlkp/models/ke/kpe.py @@ -29,34 +29,31 @@ import numpy as np import pandas as pd - import transformers from transformers import ( AutoConfig, AutoModelForTokenClassification, AutoTokenizer, + BertForTokenClassification, DataCollatorForTokenClassification, HfArgumentParser, PreTrainedTokenizerFast, Trainer, TrainingArguments, set_seed, - BertForTokenClassification, ) from transformers.trainer_utils import get_last_checkpoint, is_main_process -from .transformer.crf_models import ( - BERT_CRFforTokenClassification, - AutoCRFforTokenClassification, -) -from .transformer.token_classification_models import ( - LongformerForTokenClassification, -) -from .crf.crf_trainer import CRF_Trainer +from ...kp_dataset.datasets import KpExtractionDatasets # from extraction_utils import ModelArguments, DataTrainingArguments from ...kp_metrics.metrics import compute_metrics -from ...kp_dataset.datasets import KpExtractionDatasets +from .crf.crf_trainer import CRF_Trainer +from .transformer.crf_models import ( + AutoCRFforTokenClassification, + BERT_CRFforTokenClassification, +) +from .transformer.token_classification_models import LongformerForTokenClassification logger = logging.getLogger(__name__) @@ -86,11 +83,7 @@ def run_extraction_model(model_args, data_args, training_args): # Detecting last checkpoint. last_checkpoint = None - if ( - os.path.isdir(training_args.output_dir) - and training_args.do_train - and not training_args.overwrite_output_dir - ): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( @@ -132,9 +125,7 @@ def run_extraction_model(model_args, data_args, training_args): # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name - if model_args.tokenizer_name - else model_args.model_name_or_path, + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, add_prefix_space=True, @@ -156,20 +147,14 @@ def run_extraction_model(model_args, data_args, training_args): # config config = AutoConfig.from_pretrained( - model_args.config_name - if model_args.config_name - else model_args.model_name_or_path, + model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, cache_dir=model_args.cache_dir, ) config.use_CRF = model_args.use_CRF # model - model = ( - AutoCRFforTokenClassification - if model_args.use_CRF - else AutoModelForTokenClassification - ) + model = AutoCRFforTokenClassification if model_args.use_CRF else AutoModelForTokenClassification model = model.from_pretrained( model_args.model_name_or_path, config=config, @@ -177,9 +162,7 @@ def run_extraction_model(model_args, data_args, training_args): ) # Data collator - data_collator = DataCollatorForTokenClassification( - tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None - ) + data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) # Initialize our Trainer trainer = TRAINER_DICT["crf" if model_args.use_CRF else "token"]( @@ -212,18 +195,14 @@ def run_extraction_model(model_args, data_args, training_args): writer.write(f"{key} = {value}\n") # Need to save the state, since Trainer.save_model saves only the tokenizer with the model - trainer.state.save_to_json( - os.path.join(training_args.output_dir, "trainer_state.json") - ) + trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json")) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") results = trainer.evaluate() - output_eval_file = os.path.join( - training_args.output_dir, "eval_results_KPE.txt" - ) + output_eval_file = os.path.join(training_args.output_dir, "eval_results_KPE.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") @@ -246,25 +225,17 @@ def run_extraction_model(model_args, data_args, training_args): # for prediction in predictions # ] - output_test_results_file = os.path.join( - training_args.output_dir, "test_results.txt" - ) + output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt") if trainer.is_world_process_zero(): with open(output_test_results_file, "w") as writer: for key, value in sorted(metrics.items()): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") - output_test_predictions_file = os.path.join( - training_args.output_dir, "test_predictions.csv" - ) - output_test_predictions_BIO_file = os.path.join( - training_args.output_dir, "test_predictions_BIO.txt" - ) + output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.csv") + output_test_predictions_BIO_file = os.path.join(training_args.output_dir, "test_predictions_BIO.txt") if trainer.is_world_process_zero(): - predicted_kps = dataset.get_extracted_keyphrases( - predicted_labels=predictions - ) + predicted_kps = dataset.get_extracted_keyphrases(predicted_labels=predictions) df = pd.DataFrame.from_dict({"extractive_keyphrase": predicted_kps}) df.to_csv(output_test_predictions_file, index=False) diff --git a/src/dlkp/models/ke/transformer/crf_models.py 
b/src/dlkp/models/ke/transformer/crf_models.py index 75f2fff..322ac7e 100644 --- a/src/dlkp/models/ke/transformer/crf_models.py +++ b/src/dlkp/models/ke/transformer/crf_models.py @@ -1,17 +1,18 @@ # all token classification model with crf head -from transformers import ( - AutoModelForPreTraining, +import collections + +from transformers import ( # PretrainedModel, AutoModel, + AutoModelForPreTraining, + AutoModelForTokenClassification, BertModel, BertPreTrainedModel, LongformerModel, PreTrainedModel, - AutoModelForTokenClassification, - # PretrainedModel, ) from transformers.modeling_outputs import TokenClassifierOutput -import collections from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel + from ..crf.crf import ConditionalRandomField @@ -42,9 +43,7 @@ def forward( output_attentions=None, return_dict=None, ): - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.base_model( input_ids, @@ -96,9 +95,7 @@ def __init__(self, config): self.classifier = nn.Linear(config.hidden_size, config.num_labels) # self.crf= nn.Linear(config.num_labels,1) # self.crf= ConditionalRandomField(self.num_labels) - self.crf = ConditionalRandomField( - self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "0"} - ) + self.crf = ConditionalRandomField(self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "0"}) self.init_weights() def forward( @@ -113,9 +110,7 @@ def forward( output_attentions=None, return_dict=None, ): - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, @@ -181,9 +176,7 @@ def forward( output_attentions=None, return_dict=None, ): - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.longformer( input_ids, diff --git a/src/dlkp/models/ke/transformer/token_classification_models.py b/src/dlkp/models/ke/transformer/token_classification_models.py index d286ac1..aacf4ff 100644 --- a/src/dlkp/models/ke/transformer/token_classification_models.py +++ b/src/dlkp/models/ke/transformer/token_classification_models.py @@ -7,23 +7,21 @@ import numpy as np from datasets import ClassLabel, load_dataset, load_metric - from transformers import ( AutoConfig, + AutoModel, AutoModelForTokenClassification, AutoTokenizer, - AutoModel, DataCollatorForTokenClassification, HfArgumentParser, + LongformerForTokenClassification, PreTrainedTokenizerFast, Trainer, TrainingArguments, set_seed, - LongformerForTokenClassification, ) from transformers.trainer_utils import get_last_checkpoint, is_main_process - logger = logging.getLogger(__name__) from transformers.models.reformer.modeling_reformer import * @@ -51,9 +49,7 @@ def forward( output_attentions=None, return_dict=None, ): - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.reformer( input_ids, diff --git a/src/run_auto_ke.py b/src/run_auto_ke.py index bbeeee5..898ae0a 100644 --- a/src/run_auto_ke.py +++ b/src/run_auto_ke.py @@ -1,10 +1,11 @@ from statistics import mode -from dlkp.models.ke.kpe import run_extraction_model + 
 from dlkp.models.ke.extraction_utils import (
     DataTrainingArguments,
     ModelArguments,
     TrainingArguments,
 )
+from dlkp.models.ke.kpe import run_extraction_model

 training_args = TrainingArguments(
     output_dir="/media/nas_mount/Debanjan/amardeep/dlkp_out/inpec_debug_eval",  # todo