Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,30 @@ requires = [
"datasets",
"seqeval"
]
build-backend = "setuptools.build_meta"
build-backend = "setuptools.build_meta"

[tool.black]
line-length = 120
target-version = ['py36', 'py37', 'py38']
exclude = '''
(
/(
\.eggs
| \.git
| \.pytest_cache
| build
| dist
)/
)
'''
[tool.pytest.ini_options]
flake8-max-line-length = 210
flake8-ignore = ["E203", "W503"] # See https://github.com/PyCQA/pycodestyle/issues/373
markers = [
"integration",
]
[tool.mypy]
ignore_missing_imports = true

[tool.isort]
profile = "black"
12 changes: 10 additions & 2 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
transformers
datasets
pytest
pytest-mypy
pytest-isort
pytest-flake8
flake8-black
flake8<4.0.0
types-Deprecated
types-dataclasses
types-tabulate
types-requests
28 changes: 8 additions & 20 deletions src/dlkp/kp_dataset/datasets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os, sys
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

from datasets import ClassLabel, load_dataset


Expand Down Expand Up @@ -38,9 +40,7 @@ def set_labels(self):
def load_kp_datasets(self):
if self.data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
self.datasets = load_dataset(
self.data_args.dataset_name, self.data_args.dataset_config_name
)
self.datasets = load_dataset(self.data_args.dataset_name, self.data_args.dataset_config_name)
else:
data_files = {}
if self.data_args.train_file is not None:
Expand All @@ -63,9 +63,7 @@ def load_kp_datasets(self):
column_names = self.datasets["test"].column_names
features = self.datasets["test"].features
else:
raise AssertionError(
"neither train, validation nor test dataset is availabel"
)
raise AssertionError("neither train, validation nor test dataset is availabel")

if self.text_column_name is None:
self.text_column_name = (
Expand All @@ -75,9 +73,7 @@ def load_kp_datasets(self):
assert self.text_column_name in column_names

if self.label_column_name is None:
self.label_column_name = (
"doc_bio_tags" if "doc_bio_tags" in column_names else None
)
self.label_column_name = "doc_bio_tags" if "doc_bio_tags" in column_names else None
if len(column_names) > 2:
self.label_column_name = column_names[2]

Expand Down Expand Up @@ -132,11 +128,7 @@ def tokenize_and_align_labels_(self, examples):
# For the other tokens in a word, we set the label to either the current label or -100, depending on
# the label_all_tokens flag.
else:
label_ids.append(
self.label_to_id[label[word_idx]]
if self.data_args.label_all_tokens
else -100
)
label_ids.append(self.label_to_id[label[word_idx]] if self.data_args.label_all_tokens else -100)
# to avoid error change -100 to 'O' tag i.e. 2 class
# label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else 2)
previous_word_idx = word_idx
Expand Down Expand Up @@ -165,11 +157,7 @@ def extract_kp_from_tags(self, examples, idx):
ids = examples["input_ids"]
atn_mask = examples["special_tokens_mask"]
tokens = self.tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
tags = [
self.id_to_label[p]
for (p, m) in zip(self.predicted_labels[idx], atn_mask)
if m == 0
]
tags = [self.id_to_label[p] for (p, m) in zip(self.predicted_labels[idx], atn_mask) if m == 0]
assert len(tokens) == len(
tags
), "number of tags (={}) in prediction and tokens(={}) are not same for {}th".format(
Expand Down
8 changes: 3 additions & 5 deletions src/dlkp/kp_metrics/metrics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from seqeval.scheme import IOB2, IOB1
import numpy as np
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from seqeval.scheme import IOB1, IOB2


def compute_metrics(p):
Expand All @@ -27,9 +27,7 @@ def compute_metrics(p):
results = {}
# print("cal precisi")
# mode="strict"
results["overall_precision"] = precision_score(
true_labels, true_predictions, scheme=IOB2
)
results["overall_precision"] = precision_score(true_labels, true_predictions, scheme=IOB2)
results["overall_recall"] = recall_score(true_labels, true_predictions, scheme=IOB2)
# print("cal f1")
results["overall_f1"] = f1_score(true_labels, true_predictions, scheme=IOB2)
Expand Down
55 changes: 17 additions & 38 deletions src/dlkp/models/ke/crf/crf.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# add models having crf classification layer with option of bilstm layers

from .crf_utils import *
from typing import List, Tuple, Dict, Union
from typing import Dict, List, Tuple, Union

import torch

from .crf_utils import *

VITERBI_DECODING = Tuple[List[int], float]


Expand Down Expand Up @@ -64,9 +65,7 @@ def reset_parameters(self):
torch.nn.init.normal_(self.start_transitions)
torch.nn.init.normal_(self.end_transitions)

def _input_likelihood(
self, logits: torch.Tensor, mask: torch.BoolTensor
) -> torch.Tensor:
def _input_likelihood(self, logits: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
"""
Computes the (batch_size,) denominator term for the log-likelihood, which is the
sum of the likelihoods across all possible state sequences.
Expand Down Expand Up @@ -100,9 +99,7 @@ def _input_likelihood(

# In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension
# of `inner`. Otherwise (mask == False) we want to retain the previous alpha.
alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * (
~mask[i]
).view(batch_size, 1)
alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * (~mask[i]).view(batch_size, 1)

# Every sequence needs to end with a transition to the stop_tag.
if self.include_start_end_transitions:
Expand All @@ -113,9 +110,7 @@ def _input_likelihood(
# Finally we log_sum_exp along the num_tags dim, result is (batch_size,)
return logsumexp(stops)

def _joint_likelihood(
self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor
) -> torch.Tensor:
def _joint_likelihood(self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
"""
Computes the numerator term for the log-likelihood, which is just score(inputs, tags)
"""
Expand Down Expand Up @@ -163,18 +158,14 @@ def _joint_likelihood(

# Add the last input if it's not masked.
last_inputs = logits[-1] # (batch_size, num_tags)
last_input_score = last_inputs.gather(
1, last_tags.view(-1, 1)
) # (batch_size, 1)
last_input_score = last_inputs.gather(1, last_tags.view(-1, 1)) # (batch_size, 1)
last_input_score = last_input_score.squeeze() # (batch_size,)

score = score + last_transition_score + last_input_score * mask[-1]

return score

def forward(
self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None
) -> torch.Tensor:
def forward(self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None) -> torch.Tensor:
"""
Computes the log likelihood.
"""
Expand Down Expand Up @@ -226,33 +217,21 @@ def viterbi_tags(
transitions = torch.Tensor(num_tags + 2, num_tags + 2).fill_(-10000.0)

# Apply transition constraints
constrained_transitions = self.transitions * self._constraint_mask[
:num_tags, :num_tags
] + -10000.0 * (1 - self._constraint_mask[:num_tags, :num_tags])
constrained_transitions = self.transitions * self._constraint_mask[:num_tags, :num_tags] + -10000.0 * (
1 - self._constraint_mask[:num_tags, :num_tags]
)
transitions[:num_tags, :num_tags] = constrained_transitions.data

if self.include_start_end_transitions:
transitions[
transitions[start_tag, :num_tags] = self.start_transitions.detach() * self._constraint_mask[
start_tag, :num_tags
] = self.start_transitions.detach() * self._constraint_mask[
start_tag, :num_tags
].data + -10000.0 * (
1 - self._constraint_mask[start_tag, :num_tags].detach()
)
transitions[
].data + -10000.0 * (1 - self._constraint_mask[start_tag, :num_tags].detach())
transitions[:num_tags, end_tag] = self.end_transitions.detach() * self._constraint_mask[
:num_tags, end_tag
] = self.end_transitions.detach() * self._constraint_mask[
:num_tags, end_tag
].data + -10000.0 * (
1 - self._constraint_mask[:num_tags, end_tag].detach()
)
].data + -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach())
else:
transitions[start_tag, :num_tags] = -10000.0 * (
1 - self._constraint_mask[start_tag, :num_tags].detach()
)
transitions[:num_tags, end_tag] = -10000.0 * (
1 - self._constraint_mask[:num_tags, end_tag].detach()
)
transitions[start_tag, :num_tags] = -10000.0 * (1 - self._constraint_mask[start_tag, :num_tags].detach())
transitions[:num_tags, end_tag] = -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach())

best_paths = []
# Pad the max sequence length by 2 to account for start_tag + end_tag.
Expand Down
Loading