diff --git a/lightautoml/automl/presets/tabular_presets.py b/lightautoml/automl/presets/tabular_presets.py index 166cb653..4b46e1da 100755 --- a/lightautoml/automl/presets/tabular_presets.py +++ b/lightautoml/automl/presets/tabular_presets.py @@ -609,6 +609,7 @@ def create_automl(self, **fit_args): "autoint", "tabnet", "fttransformer", + "saint", ] available_nn_models = available_nn_models + [x + "_tuned" for x in available_nn_models] nn_models = [ diff --git a/lightautoml/dataset/base.py b/lightautoml/dataset/base.py index a033e7db..5a107f19 100644 --- a/lightautoml/dataset/base.py +++ b/lightautoml/dataset/base.py @@ -365,6 +365,19 @@ def shape(self) -> Tuple[Optional[int], Optional[int]]: return rows, cols # static methods - how to make 1d slice, 2s slice, concat of feature matrix etc ... + @staticmethod + def _vstack(datasets: Sequence[Any]) -> Any: + """Abstract method - define vertical stack of feature arrays. + + Args: + datasets: Sequence of feature arrays. + + Returns: # noqa DAR202 + Single feature array. + + """ + raise NotImplementedError("Vertical stack not implemented.") + @staticmethod def _hstack(datasets: Sequence[Any]) -> Any: """Abstract method - define horizontal stack of feature arrays. @@ -472,7 +485,42 @@ def concat(cls, datasets: Sequence["LAMLDataset"]) -> "LAMLDataset": dataset.set_data(data, features, roles) return dataset + @classmethod + def vconcat(cls, datasets: Sequence["LAMLDataset"]) -> "LAMLDataset": + """Concat multiple datasets by rows. + + Default behavior - takes empty dataset from datasets[0] + and stacks rows of all datasets. + + Args: + datasets: Sequence of datasets. + + Returns: + Concatenated dataset. + + """ + for check in cls._concat_checks: + check(datasets) + + dataset = datasets[0].empty() + data = [] + features = [*datasets[0].features] + roles = {**datasets[0].roles} + atrs = set(dataset._array_like_attrs) + for ds in datasets: + data.append(ds.data) + for atr in ds._array_like_attrs: + if atr not in atrs: + dataset._array_like_attrs.append(atr) + dataset.__dict__[atr] = ds.__dict__[atr] + atrs.update({atr}) + + data = cls._vstack(data) + dataset.set_data(data, features, roles) + + return dataset + def drop_features(self, droplist: Sequence[str]): """Inplace drop columns from dataset. diff --git a/lightautoml/dataset/np_pd_dataset.py b/lightautoml/dataset/np_pd_dataset.py index 3ec8789c..bffc37c4 100644 --- a/lightautoml/dataset/np_pd_dataset.py +++ b/lightautoml/dataset/np_pd_dataset.py @@ -212,6 +212,18 @@ def _hstack(datasets: Sequence[np.ndarray]) -> np.ndarray: """ return np.hstack(datasets) + @staticmethod + def _vstack(datasets: Sequence[np.ndarray]) -> np.ndarray: + """Concatenate function for numpy arrays. + + Args: + datasets: Sequence of np.ndarray. + + Returns: + Stacked features array. + + """ + return np.vstack(datasets) @staticmethod def _get_rows(data: np.ndarray, k: IntIdx) -> np.ndarray: @@ -400,6 +412,18 @@ def _hstack(datasets: Sequence[Union[sparse.csr_matrix, np.ndarray]]) -> sparse. """ return sparse.hstack(datasets, format="csr") + @staticmethod + def _vstack(datasets: Sequence[Union[sparse.csr_matrix, np.ndarray]]) -> sparse.csr_matrix: + """Concatenate function for sparse and numpy arrays. + + Args: + datasets: Sequence of csr_matrix or np.ndarray. + + Returns: + Sparse matrix.
+ + """ + return sparse.vstack(datasets, format="csr") def __init__( self, @@ -609,6 +632,19 @@ def _hstack(datasets: Sequence[DataFrame]) -> DataFrame: """ return pd.concat(datasets, axis=1) + + @staticmethod + def _vstack(datasets: Sequence[DataFrame]) -> DataFrame: + """Define how to concat features arrays. + + Args: + datasets: Sequence of tables. + + Returns: + concatenated table. + + """ + return pd.concat(datasets, axis=0) @staticmethod def _get_rows(data: DataFrame, k: IntIdx) -> FrameOrSeries: diff --git a/lightautoml/dataset/utils.py b/lightautoml/dataset/utils.py index 5f3410e5..158e9fa0 100644 --- a/lightautoml/dataset/utils.py +++ b/lightautoml/dataset/utils.py @@ -158,3 +158,118 @@ def concatenate(datasets: Sequence[LAMLDataset]) -> LAMLDataset: datasets = [datasets[n]] + [x for (y, x) in enumerate(datasets) if n != y] return conc(datasets) + + + +def get_common_vconcat( + datasets: Sequence[LAMLDataset], +) -> Tuple[Callable, Optional[type]]: + """Get concatenation function for datasets of different types. + + Takes multiple datasets as input and check, + if is's ok to concatenate it and return function. + + Args: + datasets: Sequence of datasets. + + Returns: + Function, that is able to concatenate datasets. + + """ + # TODO: Add pandas + numpy via transforming to numpy? + dataset_types = set([type(x) for x in datasets]) + + # general - if single type, concatenation for that type + if len(dataset_types) == 1: + klass = list(dataset_types)[0] + return klass.vconcat, None + + # np and sparse goes to sparse + elif dataset_types == {NumpyDataset, CSRSparseDataset}: + return CSRSparseDataset.vconcat, CSRSparseDataset + + elif dataset_types == {NumpyDataset, PandasDataset}: + return numpy_and_pandas_vconcat, None + + elif (dataset_types == {NumpyDataset, SeqNumpyPandasDataset}) or ( + dataset_types == {PandasDataset, SeqNumpyPandasDataset} + ): + return numpy_or_pandas_and_seq_vconcat, None + + raise TypeError("Unable to concatenate dataset types {0}".format(list(dataset_types))) + + +def numpy_and_pandas_vconcat(datasets: Sequence[Union[NumpyDataset, PandasDataset]]) -> PandasDataset: + """Concat of numpy and pandas dataset. + + Args: + datasets: Sequence of datasets to concatenate. + + Returns: + Concatenated dataset. + + """ + datasets = [x.to_pandas() for x in datasets] + + return PandasDataset.vconcat(datasets) + + +def numpy_or_pandas_and_seq_vconcat( + datasets: Sequence[Union[NumpyDataset, PandasDataset, SeqNumpyPandasDataset]] +) -> Union[NumpyDataset, PandasDataset]: + """Concat plain and sequential dataset. + + If both datasets have same size then concat them as plain, otherwise include seq dataset inside plain one. + + Args: + datasets: one plain and one seq dataset. + + Returns: + Concatenated dataset. + + """ + assert len(datasets) == 2, "should be 1 sequential and 1 plain dataset" + # get 1 numpy / pandas dataset + for n, dataset in enumerate(datasets): + if type(dataset) == SeqNumpyPandasDataset: + seq_dataset = dataset + else: + plain_dataset = dataset + + if len(seq_dataset.data) == len(plain_dataset): + return SeqNumpyPandasDataset.vconcat([seq_dataset, plain_dataset.to_pandas()]) + else: + if hasattr(plain_dataset, "seq_data"): + plain_dataset.seq_data[seq_dataset.name] = seq_dataset + else: + plain_dataset.seq_data = {seq_dataset.name: seq_dataset} + + return plain_dataset + + +def vconcatenate(datasets: Sequence[LAMLDataset]) -> LAMLDataset: + """Dataset concatenation function. + + Check if datasets have common concat function and then apply. 
+ Assume to take target/folds/weights etc from first one. + + Args: + datasets: Sequence of datasets. + + Returns: + Dataset with concatenated features. + + """ + conc, klass = get_common_vconcat([ds for ds in datasets if ds is not None]) + + # this part is made to avoid setting first dataset of required type + if klass is not None: + + n = 0 + for n, ds in enumerate(datasets): + if type(ds) is klass: + break + + datasets = [datasets[n]] + [x for (y, x) in enumerate(datasets) if n != y] + + return conc(datasets) \ No newline at end of file diff --git a/lightautoml/ml_algo/base.py b/lightautoml/ml_algo/base.py index 0dec5aba..74c3c6da 100755 --- a/lightautoml/ml_algo/base.py +++ b/lightautoml/ml_algo/base.py @@ -16,7 +16,7 @@ import numpy as np -from lightautoml.validation.base import TrainValidIterator +from lightautoml.validation.base import HoldoutIterator, TrainValidIterator from ..dataset.base import LAMLDataset from ..dataset.np_pd_dataset import CSRSparseDataset @@ -271,8 +271,8 @@ def fit_predict(self, train_valid_iterator: TrainValidIterator) -> NumpyDataset: "===== Start working with \x1b[1mfold {}\x1b[0m for \x1b[1m{}\x1b[0m =====".format(n, self._name) ) self.timer.set_control_point() - - model, pred = self.fit_predict_single_fold(train, valid) + self.params['is_holdout'] = isinstance(train_valid_iterator,HoldoutIterator) + model, pred = self.fit_predict_single_fold(train, valid, 0) self.models.append(model) preds_arr[idx] += pred.reshape((pred.shape[0], -1)) counter_arr[idx] += 1 diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 4fe7240c..118b564a 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -1,6 +1,8 @@ """Neural net for tabular datasets.""" +from lightautoml.dataset.base import LAMLDataset +from lightautoml.dataset.utils import vconcatenate from lightautoml.utils.installation import __validate_extra_deps @@ -51,6 +53,8 @@ DenseEmbeddingFlat, LinearEmbedding, LinearEmbeddingFlat, + MLPContEmbedding, + MLPContEmbeddingFlat, PLREmbedding, PLREmbeddingFlat, SoftEmbedding, @@ -73,6 +77,7 @@ from .torch_based.nn_models import MLP, TabNet from .torch_based.nn_models import NODE from .torch_based.nn_models import SNN +from .torch_based.nn_models import SAINT from .torch_based.nn_models import DenseLightModel from .torch_based.nn_models import DenseModel from .torch_based.nn_models import LinearLayer @@ -84,6 +89,8 @@ logger = logging.getLogger(__name__) +models_dependent_on_training_data = ["saint"] + model_by_name = { "denselight": DenseLightModel, "dense": DenseModel, @@ -96,7 +103,9 @@ "autoint": AutoInt, "tabnet": TabNet, "fttransformer": FTTransformer, + "saint":SAINT, } + input_type_by_name = { "denselight": "flat", "dense": "flat", @@ -109,6 +118,7 @@ "autoint": "seq", "tabnet": "flat", "fttransformer": "seq", + "saint": "seq", } cat_embedder_by_name_flat = { "cat": CatEmbedder, @@ -127,6 +137,7 @@ "dense": DenseEmbeddingFlat, "plr": PLREmbeddingFlat, "soft": SoftEmbeddingFlat, + "mlp": MLPContEmbeddingFlat } cont_embedder_by_name = { "cont": LinearEmbedding, @@ -134,6 +145,7 @@ "dense": DenseEmbedding, "plr": PLREmbedding, "soft": SoftEmbedding, + "mlp": MLPContEmbedding, } @@ -255,7 +267,7 @@ class TorchModel(TabularMLAlgo): **_default_models_params, } - def _infer_params(self): + def _infer_params(self, train = None): if self.params["path_to_save"] is not None: self.path_to_save = os.path.relpath(self.params["path_to_save"]) if not os.path.exists(self.path_to_save): @@ -304,6 +316,22 @@ def 
_infer_params(self): params[p_name] = getattr(module, params[p_name]) # params = self._select_params(params) + if params['model'] in models_dependent_on_training_data: + self.use_sampler = True + if train is not None: + self.train = train + else: + self.use_sampler = False + + self.train_params = { + "dataset": params["dataset"], + "bs": params["bs"], + "num_workers": params["num_workers"], + "pin_memory": params["pin_memory"], + "tokenizer": AutoTokenizer.from_pretrained(params["bert_name"], use_fast=False) if is_text else None, + "max_length": params["max_length"], + } + model = Trainer( net=TorchUniversalModel if not params["model_with_emb"] else params["model"], net_params={ @@ -314,10 +342,11 @@ def _infer_params(self): if is_cont else None, "cont_params": { - "num_dims": params["num_dims"], - "input_bn": params["input_bn"], - "device": params["device"], - "embedding_size": params["embedding_size"], + # "num_dims": params["num_dims"], + # "input_bn": params["input_bn"], + # "device": params["device"], + # "embedding_size": params["embedding_size"], + **params } if is_cont else None, @@ -327,13 +356,14 @@ def _infer_params(self): if is_cat else None, "cat_params": { - "cat_vc": params["cat_vc"], - "cat_dims": params["cat_dims"], - "emb_dropout": params["emb_dropout"], - "emb_ratio": params["emb_ratio"], - "max_emb_size": params["max_emb_size"], - "embedding_size": params["embedding_size"], - "device": params["device"], + # "cat_vc": params["cat_vc"], + # "cat_dims": params["cat_dims"], + # "emb_dropout": params["emb_dropout"], + # "emb_ratio": params["emb_ratio"], + # "max_emb_size": params["max_emb_size"], + # "embedding_size": params["embedding_size"], + # "device": params["device"], + **params } if is_cat else None, @@ -347,18 +377,11 @@ def _infer_params(self): "torch_model": torch_model, **params, }, - **{"apex": False, **params}, + + **{"apex": False, + **params}, ) - self.train_params = { - "dataset": params["dataset"], - "bs": params["bs"], - "num_workers": params["num_workers"], - "pin_memory": params["pin_memory"], - "tokenizer": AutoTokenizer.from_pretrained(params["bert_name"], use_fast=False) if is_text else None, - "max_length": params["max_length"], - } - return model @staticmethod @@ -485,7 +508,7 @@ def init_params_on_input(self, train_valid_iterator) -> dict: ) return suggested_params - def get_dataloaders_from_dicts(self, data_dict: Dict): + def get_dataloaders_from_dicts(self, data_dict: Dict, n : int =0): """Construct dataloaders depending on stage. Args: @@ -511,6 +534,7 @@ def get_dataloaders_from_dicts(self, data_dict: Dict): } datasets[stage] = self.train_params["dataset"]( + fold = n, data=data, y=value.target.values if stage != "test" else np.ones(len(value.data)), w=value.weights.values if value.weights is not None else np.ones(len(value.data)), @@ -551,8 +575,8 @@ def fit_predict(self, train_valid_iterator: TrainValidIterator) -> NumpyDataset: self.params = self.init_params_on_input(train_valid_iterator) self.params = self._init_params_on_input(train_valid_iterator) return super().fit_predict(train_valid_iterator) - - def fit_predict_single_fold(self, train, valid): + + def fit_predict_single_fold(self, train: TabularDataset, valid: TabularDataset, n=0): """Implements training and prediction on single fold. 
Args: @@ -568,14 +592,21 @@ def fit_predict_single_fold(self, train, valid): target = train.target self.params["bias"] = self.get_mean_target(target, task_name) if self.params["init_bias"] else None - model = self._infer_params() + if self.params['is_holdout']: + ds = train + else: + ds = vconcatenate([train,valid]) + model = self._infer_params(ds) model_path = ( os.path.join(self.path_to_save, f"{uuid.uuid4()}.pickle") if self.path_to_save is not None else None ) # init datasets - dataloaders = self.get_dataloaders_from_dicts({"train": train.to_pandas(), "val": valid.to_pandas()}) - + if self.use_sampler: + dataloaders = self.get_dataloaders_from_dicts({"train": train.to_pandas(), "val": valid.to_pandas(),"sampler": train.to_pandas()},n) + else: + dataloaders = self.get_dataloaders_from_dicts({"train": train.to_pandas(), "val": valid.to_pandas()},n) + dataloaders['sampler'] = None val_pred = model.fit(dataloaders) if model_path is None: @@ -601,12 +632,17 @@ def predict_single_fold(self, model: any, dataset: TabularDataset) -> np.ndarray """ seed_everything(self.params["random_state"], self.params["deterministic"]) - dataloaders = self.get_dataloaders_from_dicts({"test": dataset.to_pandas()}) + if self.use_sampler: + dataloaders = self.get_dataloaders_from_dicts({"test": dataset.to_pandas(),"sampler": self.train.to_pandas()}) + else: + dataloaders = self.get_dataloaders_from_dicts({"test": dataset.to_pandas()}) + dataloaders['sampler'] = None + if isinstance(model, (str, dict)): model = self._infer_params().load_state(model) - pred = model.predict(dataloaders["test"], "test") + pred = model.predict(dataloaders, "test") model.clean() del dataloaders, model diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index b895ee3b..0ee8c0b4 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -8,7 +8,9 @@ import numpy as np import torch import torch.nn as nn -from ..tabnet.utils import TabNetEncoder, _initialize_non_glu + +from .saint.saint_utils import ColTransformer, RowColTransformer +from .tabnet.utils import TabNetEncoder, _initialize_non_glu from .autoint.autoint_utils import AttnInteractionBlock, LeakyGate from .autoint.ghost_norm import GhostBatchNorm from .fttransformer.fttransformer_utils import Transformer @@ -153,9 +155,9 @@ def __init__( concat_input: bool = True, dropout_first: bool = True, bn_momentum: float = 0.1, - ghost_batch: Optional[int] = 64, - use_skip: bool = True, - leaky_gate: bool = True, + ghost_batch: Optional[int] = None, + use_skip: bool = False, + leaky_gate: bool = False, weighted_sum: bool = True, device: torch.device = torch.device("cuda:0"), **kwargs, @@ -827,6 +829,7 @@ class NODE(nn.Module): layer_dim: num trees in one layer. num_layers: number of forests. tree_dim: number of response channels in the response of individual tree. + choice_function: str `entmax` or `sparsmax` use_original_head use averaging as a head or put linear layer instead. depth: number of splits in every tree. drop_rate: Dropout rate for each layer altogether. 
@@ -842,6 +845,7 @@ def __init__( layer_dim: int = 2048, num_layers: int = 1, tree_dim: int = 1, + choice_function="entmax", use_original_head: bool = False, depth: int = 6, drop_rate: float = 0.0, @@ -860,6 +864,7 @@ def __init__( num_layers=num_layers, tree_dim=tree_dim if not use_original_head else n_out, depth=depth, + choice_function=choice_function, input_dropout=drop_rate, flatten_output=not use_original_head, ) @@ -1184,3 +1189,129 @@ def forward(self, x): def forward_masks(self, x): """Magic forward-pass of encoder that returns masks.""" return self.encoder.forward_masks(x) + + +class SAINT(nn.Module): + """Implementation of Saint from https://github.com/yandex-research/tabular-dl-tabr. + + Args: + n_in : int + Number of features + n_out : int or list of int for multi task classification + Dimension of network output + embedding_size : embedding_size + Dimension of the embedding + depth : int + Number of Attention Blocks. + heads : int + Number of heads in Attention. + dim_head : int + Attention head dimension. + mlp_hidden_mults : int | tuple[int] + Multiply hidden state of MLP. + ffn_mult : int + Multiply hidden state of feed forward layer. + attn_dropout : float + Post-Attention dropout. + ff_dropout : int + Feed-Forward Dropout. + mlp_dropout : float + MLP Dropout. + attentiontype : str + Either "colrow" or "row" : this is the masking attention to use + device : torch.device + kwargs : kwargs + """ + + def __init__( + self, + n_in: int, + n_out: int = 1, + embedding_size: int = 10, + depth: int = 2, + heads: int = 8, + dim_head=16, + mlp_hidden_mults=(4, 2), + ffn_mult=4, + attn_dropout=0.0, + ff_dropout=0.0, + mlp_dropout=0.0, + attentiontype="colrow", + pooling: str = "cls", + device: torch.device = torch.device("cuda:0"), + **kwargs, + ): + super().__init__() + self.device = device + self.cls_token = nn.Embedding(2, embedding_size) + self.attentiontype = attentiontype + if attentiontype == "col": + self.transformer = ColTransformer( + dim=embedding_size, + depth=depth, + heads=heads, + dim_head=dim_head, + attn_dropout=attn_dropout, + ff_dropout=ff_dropout, + ) + elif attentiontype in ["row", "colrow"]: + self.transformer = RowColTransformer( + dim=embedding_size, + nfeats=n_in + 1, # num featurs + depth=depth, + heads=heads, + dim_head=dim_head, + ffn_mult=ffn_mult, + attn_dropout=attn_dropout, + ff_dropout=ff_dropout, + style=attentiontype, + ) + + l_rate = (n_in + 1) // 8 # input_size = (dim * self.num_categories) + (dim * num_continuous) + hidden_dimensions = list(map(lambda t: l_rate * t, mlp_hidden_mults)) + self.pooling = pooling_by_name[pooling]() + self.mlp = MLP( + n_in=embedding_size * 2 if pooling == "concat" else embedding_size, + n_out=n_out, + hidden_size=hidden_dimensions, + drop_rate=mlp_dropout, + use_bn=False, + dropout_first=False, + ) + + def forward(self, embedded: torch.Tensor, bs: int) -> torch.Tensor: + """Transform the input tensor. 
+ + Args: + embedded : torch.Tensor + embedded fields + bs : batch size without sapler`s part + + Returns: + torch.Tensor + + """ + mask = torch.zeros((len(embedded), len(embedded)), device=self.device, dtype=torch.bool) + mask[torch.arange(bs), torch.arange(bs)] = 1 + # NOTE that it was: + # mask[:bs, bs:] = 1 + # mask[bs:, bs:] = 1 + # probably misprint + mask[:bs, bs:] = 1 + mask[bs:, :bs] = 1 + + cls_token = torch.unsqueeze( + self.cls_token(torch.ones(embedded.shape[0], dtype=torch.int).to(self.device)), dim=1 + ) + x = torch.cat((cls_token, embedded), dim=1) + x = self.transformer(x, mask_samples=mask) + + # NOTE modified to simple X -> Y supervised model + + # cat_outs = self.mlp1(x[:,:self.num_categories,:]) + # con_outs = self.mlp2(x[:,self.num_categories:,:]) + # return cat_outs, con_outs + x_mask = torch.ones(x.shape, dtype=torch.bool).to(self.device) + pool_tokens = self.pooling(x=x, x_mask=x_mask) + logits = self.mlp(pool_tokens) + return logits diff --git a/lightautoml/ml_algo/torch_based/node_nn_model.py b/lightautoml/ml_algo/torch_based/node_nn_model.py index e57f5125..e3f3f6da 100644 --- a/lightautoml/ml_algo/torch_based/node_nn_model.py +++ b/lightautoml/ml_algo/torch_based/node_nn_model.py @@ -554,6 +554,7 @@ class DenseODSTBlock(nn.Sequential): max_features: maximum number of features per input depth: number of splits in every tree. input_dropout: Dropout rate forest layer. + choice_function: str `entmax` or `sparsmax`. flatten_output: flatten output or not. """ @@ -565,12 +566,23 @@ def __init__( tree_dim=1, max_features=None, input_dropout=0.0, + choice_function="entmax", flatten_output=True, **kwargs ): layers = [] + ch_f = Sparsemax() if choice_function == "sparsmax" else Entmax15() + bin_f = Sparsemoid() if choice_function == "sparsmax" else Entmoid15() for i in range(num_layers): - oddt = ODST(input_dim, layer_dim, tree_dim=tree_dim, flatten_output=True, **kwargs) + oddt = ODST( + input_dim, + layer_dim, + tree_dim=tree_dim, + flatten_output=True, + choice_function=ch_f, + bin_function=bin_f, + **kwargs + ) input_dim = min(input_dim + layer_dim * tree_dim, max_features or float("inf")) layers.append(oddt) diff --git a/lightautoml/ml_algo/torch_based/saint/saint_utils.py b/lightautoml/ml_algo/torch_based/saint/saint_utils.py new file mode 100644 index 00000000..d9dea227 --- /dev/null +++ b/lightautoml/ml_algo/torch_based/saint/saint_utils.py @@ -0,0 +1,256 @@ +"""Saint utils.""" + +from einops import rearrange +from torch import einsum, nn + +from ..fttransformer.fttransformer_utils import GEGLU + + +class Residual(nn.Module): + """Residual connection layer. + + Args: + fn : function to apply + """ + + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x, **kwargs): + """Forward-pass.""" + return self.fn(x, **kwargs) + x + + +class PreNorm(nn.Module): + """Normalization connection layer. + + Args: + fn : function to apply + """ + + def __init__(self, dim, fn): + super().__init__() + self.norm = nn.LayerNorm(dim) + self.fn = fn + + def forward(self, x, **kwargs): + """Forward-pass.""" + return self.fn(self.norm(x), **kwargs) + + +# attention + + +class FeedForward(nn.Module): + """Feedforward for Transformer block. + + Args: + dim: Embeddings dimension. + mult: multiply hidden state dim. + dropout: Post-Attention dropout. 
+ """ + + def __init__(self, dim, mult=4, dropout=0.0): + super().__init__() + self.net = nn.Sequential( + nn.Linear(dim, int(dim * mult) * 2), GEGLU(), nn.Dropout(dropout), nn.Linear(int(dim * mult), dim) + ) + + def forward(self, x, **kwargs): + """Forward-pass. + + Args: + x : torch.Tensor + 3-d tensor; for example, embedded numeric and/or categorical values, + or the output of a previous attention layer. + kwargs: kwargs + + Returns: + torch.Tensor + + """ + return self.net(x, **kwargs) + + +class Attention(nn.Module): + """Attention Block. + + Args: + dim: Embeddings dimension. + heads: Number of heads in Attention. + dim_head: Attention head dimension. + dropout: Post-Attention dropout. + """ + + def __init__(self, dim, heads=8, dim_head=16, dropout=0.0): + super().__init__() + inner_dim = dim_head * heads + self.heads = heads + self.scale = dim_head ** -0.5 + + self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False) + self.to_out = nn.Linear(inner_dim, dim) + + self.dropout = nn.Dropout(dropout) + + def forward(self, x, mask=None): + """Transform the input tensor with attention. + + Args: + x : torch.Tensor + 3-d tensor; for example, embedded numeric and/or categorical values, + or the output of a previous attention layer. + mask: torch.Tensor + + Returns: + torch.Tensor + + """ + h = self.heads + q, k, v = self.to_qkv(x).chunk(3, dim=-1) + q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v)) + sim = einsum("b h i d, b h j d -> b h i j", q, k) * self.scale + if mask is not None: + sim[~mask[None, None].expand_as(sim)] = float("-inf") + attn = sim.softmax(dim=-1) + out = einsum("b h i j, b h j d -> b h i d", attn, v) + out = rearrange(out, "b h n d -> b n (h d)", h=h) + return self.to_out(out) + + +class RowColTransformer(nn.Module): + """Transformer Block. + + Args: + dim: Embeddings dimension. + nfeats: Number of features. + depth: Number of Attention Blocks. + heads: Number of heads in Attention. + dim_head: Attention head dimension. + ffn_mult: multiply hidden state of feed forward layer. + attn_dropout: Post-Attention dropout. + ff_dropout: Feed-Forward Dropout. + style: attention style: 'col' or 'colrow' + """ + + def __init__(self, dim, nfeats, depth, heads, dim_head, ffn_mult, attn_dropout, ff_dropout, style="col"): + super().__init__() + self.layers = nn.ModuleList([]) + self.mask_embed = nn.Embedding(nfeats, dim) + self.style = style + for _ in range(depth): + if self.style == "colrow": + self.layers.append( + nn.ModuleList( + [ + PreNorm( + dim, Residual(Attention(dim, heads=heads, dim_head=dim_head, dropout=attn_dropout)) + ), + PreNorm(dim, Residual(FeedForward(dim, mult=ffn_mult, dropout=ff_dropout))), + PreNorm( + dim * nfeats, + Residual(Attention(dim * nfeats, heads=heads, dim_head=dim_head, dropout=attn_dropout)), + ), + PreNorm( + dim * nfeats, Residual(FeedForward(dim * nfeats, mult=ffn_mult, dropout=ff_dropout)) + ), + ] + ) + ) + else: + self.layers.append( + nn.ModuleList( + [ + PreNorm( + dim * nfeats, + Residual(Attention(dim * nfeats, heads=heads, dim_head=64, dropout=attn_dropout)), + ), + PreNorm( + dim * nfeats, Residual(FeedForward(dim * nfeats, mult=ffn_mult, dropout=ff_dropout)) + ), + ] + ) + ) + + def forward(self, x, mask_features=None, mask_samples=None): + """Transform the input embeddings tensor with Transformer module. + + Args: + x : torch.Tensor + 3-d tensor; embedded numeric and/or categorical values, + or the output of a previous Transformer layer. 
+ mask_features: torch.Tensor + mask for the first attention + mask_samples: torch.Tensor + mask for the second attention + + Returns: + torch.Tensor + + """ + _, n, _ = x.shape + if self.style == "colrow": + for attn1, ff1, attn2, ff2 in self.layers: # type: ignore[code] + x = attn1(x, mask=mask_features) + x = ff1(x) + x = rearrange(x, "b n d -> 1 b (n d)") + x = attn2(x, mask=mask_samples) + x = ff2(x) + x = rearrange(x, "1 b (n d) -> b n d", n=n) + else: + for attn1, ff1 in self.layers: # type: ignore[code] + x = rearrange(x, "b n d -> 1 b (n d)") + x = attn1(x) + x = ff1(x) + x = rearrange(x, "1 b (n d) -> b n d", n=n) + return x + + +# transformer +class ColTransformer(nn.Module): + """Transformer Block. + + Args: + dim: Embeddings dimension. + depth: Number of Attention Blocks. + heads: Number of heads in Attention. + dim_head: Attention head dimension. + attn_dropout: Post-Attention dropout. + ff_dropout: Feed-Forward Dropout. + """ + + def __init__(self, dim, depth, heads, dim_head, attn_dropout, ff_dropout): + super().__init__() + self.layers = nn.ModuleList([]) + + for _ in range(depth): + self.layers.append( + nn.ModuleList( + [ + PreNorm(dim, Residual(Attention(dim, heads=heads, dim_head=dim_head, dropout=attn_dropout))), + PreNorm(dim, Residual(FeedForward(dim, dropout=ff_dropout))), + ] + ) + ) + + def forward(self, x, mask_features=None, mask_samples=None): + """Transform the input embeddings tensor with Transformer module. + + Args: + x : torch.Tensor + 3-d tensor; embedded numeric and/or categorical values, + or the output of a previous Transformer layer. + mask_features: torch.Tensor + not used + mask_samples: torch.Tensor + not used + + Returns: + torch.Tensor + + """ + for attn, ff in self.layers: + x = attn(x) + x = ff(x) + return x diff --git a/lightautoml/ml_algo/tabnet/utils.py b/lightautoml/ml_algo/torch_based/tabnet/utils.py similarity index 99% rename from lightautoml/ml_algo/tabnet/utils.py rename to lightautoml/ml_algo/torch_based/tabnet/utils.py index 8530be5d..dc2f9d75 100644 --- a/lightautoml/ml_algo/tabnet/utils.py +++ b/lightautoml/ml_algo/torch_based/tabnet/utils.py @@ -2,8 +2,8 @@ import torch import numpy as np import torch.nn as nn -from ..torch_based.node_nn_model import Entmax15, Sparsemax -from ..torch_based.autoint.ghost_norm import GhostBatchNorm +from ..node_nn_model import Entmax15, Sparsemax +from ..autoint.ghost_norm import GhostBatchNorm def _initialize_non_glu(module, input_dim, output_dim): diff --git a/lightautoml/text/embed.py b/lightautoml/text/embed.py index 0fbe062d..b4974d3d 100644 --- a/lightautoml/text/embed.py +++ b/lightautoml/text/embed.py @@ -705,3 +705,72 @@ class SoftEmbeddingFlat(SoftEmbedding): def __init__(self, *args, **kwargs): super(SoftEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) + + +class MLPContEmbedding(nn.Module): + """Per-feature MLP embedding for continuous features. + + Args: + num_dims : number of continuous features. + embedding_size: size of the embedding produced for each feature. + d_hidden: hidden size of the per-feature MLP. + flatten_output: if True, flatten the output to 2d. + """ + + def __init__( + self, + num_dims: int, + embedding_size: int = 10, + d_hidden: int = 64, + flatten_output: bool = False, + **kwargs, + ) -> None: + super().__init__() + self.flatten_output = flatten_output + self.embedding_size = embedding_size + self.num_dims = num_dims + self.layers = nn.ModuleList( + [ + nn.Sequential(nn.Linear(1, d_hidden), nn.ReLU(), nn.Linear(d_hidden, embedding_size)) + for _ in range(num_dims) + ] + ) + + def get_out_shape(self) -> int: + """Output shape. + + Returns: + int with module output shape.
+ + """ + if self.flatten_output: + return self.num_dims * self.embedding_size + else: + return self.num_dims + + def forward(self, X: Dict) -> Tensor: + """Produce embedding for each value in input. + + Args: + X : Dict + + Returns: + torch.Tensor + + """ + x = X["cont"] + # ans = [] + # for i, l in enumerate(self.layers): + # temp = x[:,i].view(x.size(0),-1) + # temp = l(temp) + # x = torch.stack(ans,1) + x = torch.stack([l(x[:, i].view(-1,1)) for i, l in enumerate(self.layers)], 1) + if self.flatten_output: + return x.view(x.shape[0], -1) + return x + +class MLPContEmbeddingFlat(MLPContEmbedding): + """Flatten version of BasicCatEmbedding.""" + + def __init__(self, *args, **kwargs): + super(MLPContEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) \ No newline at end of file diff --git a/lightautoml/text/nn_model.py b/lightautoml/text/nn_model.py index 58fa1574..131901d1 100644 --- a/lightautoml/text/nn_model.py +++ b/lightautoml/text/nn_model.py @@ -12,7 +12,7 @@ import torch.nn as nn from ..tasks.base import Task - +from .utils import _dtypes_mapping logger = logging.getLogger(__name__) @@ -31,6 +31,7 @@ class UniversalDataset: def __init__( self, + fold: int, data: Dict[str, np.ndarray], y: np.ndarray, w: Optional[np.ndarray] = None, @@ -38,6 +39,7 @@ def __init__( max_length: int = 256, stage: str = "test", ): + self.fold = fold self.data = data self.y = y self.w = w @@ -49,7 +51,7 @@ def __len__(self) -> int: return len(self.y) def __getitem__(self, index: int) -> Dict[str, np.ndarray]: - res = {"label": self.y[index]} + res = {"fold":self.fold ,"label": self.y[index]} res.update({key: value[index] for key, value in self.data.items() if key != "text"}) if (self.tokenizer is not None) and ("text" in self.data): sent = self.data["text"][index, 0] # only one column @@ -85,7 +87,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward-pass.""" x = torch.clamp(x, self.min_v, self.max_v) return x - + class TorchUniversalModel(nn.Module): """Mixed data model. 
@@ -133,6 +135,7 @@ def __init__( self.cont_embedder = None self.cat_embedder = None self.text_embedder = None + self.sampler = None n_in = 0 if cont_embedder_ is not None: @@ -169,11 +172,8 @@ def __init__( self.softmax = nn.Softmax(dim=1) def _set_last_layer(self, torch_model, bias): - try: - use_skip = torch_model.use_skip - self._init_last_layers(torch_model, bias, use_skip) - except: - self._init_last_layers(torch_model, bias, False) + use_skip = getattr(torch_model, "use_skip", False) + self._init_last_layers(torch_model, bias, use_skip) def _init_last_layers(self, torch_model, bias, use_skip=False): try: @@ -215,9 +215,12 @@ def _init_last_layers(self, torch_model, bias, use_skip=False): except: logger.info3("Last linear layer not founded, so init_bias=False") - def get_logits(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: + + def get_logits(self, inp: Dict[str, torch.Tensor],efficient_bs:int = None) -> torch.Tensor: """Forward-pass of model with embeddings.""" outputs = [] + + if self.cont_embedder is not None: outputs.append(self.cont_embedder(inp)) @@ -231,8 +234,10 @@ def get_logits(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: output = torch.cat(outputs, dim=1) else: output = outputs[0] - - logits = self.torch_model(output) + if efficient_bs is not None: + logits = self.torch_model(output,efficient_bs) + else: + logits = self.torch_model(output) return logits def get_preds_from_logits(self, logits: torch.Tensor) -> torch.Tensor: @@ -249,7 +254,16 @@ def get_preds_from_logits(self, logits: torch.Tensor) -> torch.Tensor: def forward(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: """Forward-pass with output loss.""" - x = self.get_logits(inp) + efficient_bs = None + if inp['sampler'] is not None: + efficient_bs = len(inp['label']) + candidate_sample = next(inp['sampler']) + inp = { + i: torch.cat([inp[i], + (candidate_sample[i].long().to(self.torch_model.device) if _dtypes_mapping[i] == "long" else candidate_sample[i].to(self.torch_model.device))]) + for i in set(inp.keys())-set(['sampler']) + } + x = self.get_logits(inp,efficient_bs) if not self.loss_on_logits: x = self.get_preds_from_logits(x) @@ -258,6 +272,15 @@ def forward(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: def predict(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: """Prediction.""" - x = self.get_logits(inp) + efficient_bs = None + if inp['sampler'] is not None: + efficient_bs = len(inp['label']) + candidate_sample = next(inp['sampler']) + inp = { + i: torch.cat([inp[i], + (candidate_sample[i].long().to(self.torch_model.device) if _dtypes_mapping[i] == "long" else candidate_sample[i].to(self.torch_model.device))]) + for i in set(inp.keys())-set(['sampler']) + } + x = self.get_logits(inp,efficient_bs) x = self.get_preds_from_logits(x) return x diff --git a/lightautoml/text/trainer.py b/lightautoml/text/trainer.py index 240af7e0..d8a3bf73 100644 --- a/lightautoml/text/trainer.py +++ b/lightautoml/text/trainer.py @@ -237,6 +237,28 @@ def load_state_dict(self, weights: Dict, model: nn.Module): return self +class InfIterator(object): + """Infinite Iterator. + + Args: + dataloader : torch.utils.dataloader + """ + + def __init__(self, dataloader): + self.dl = dataloader + self.it = iter(self.dl) + + def __iter__(self): + return self + + def __next__(self): + try: + return next(self.it) + except StopIteration: + self.it = iter(self.dl) + return next(self.it) + + class Trainer: """Torch main trainer class. 
@@ -436,7 +458,8 @@ def fit(self, dataloaders: Dict[str, DataLoader]) -> np.ndarray: train_loss = self.train(dataloaders=dataloaders) train_log.extend(train_loss) # test - val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) + + val_loss, val_data, weights = self.test(dataloaders=dataloaders) if self.stop_by_metric: cond = -1 * self.metric(*val_data, weights) else: @@ -461,14 +484,14 @@ def fit(self, dataloaders: Dict[str, DataLoader]) -> np.ndarray: self.se.set_best_params(self.model) if self.is_snap: - val_loss, val_data, weights = self.test(dataloader=dataloaders["val"], snap=True, stage="val") + val_loss, val_data, weights = self.test(dataloaders=dataloaders, snap=True, stage="val") logger.info3( "Result SE, val loss: {vl}, val metric: {me}".format( me=self.metric(*val_data, weights), vl=np.mean(val_loss) ) ) elif self.se.swa: - val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) + val_loss, val_data, weights = self.test(dataloaders=dataloaders) logger.info3( "Early stopping: val loss: {vl}, val metric: {me}".format( me=self.metric(*val_data, weights), vl=np.mean(val_loss) @@ -489,6 +512,7 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: Loss. """ + ################## loss_log = [] self.model.train() running_loss = 0 @@ -499,13 +523,20 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: loader = tqdm(dataloaders["train"], desc="train", disable=False) else: loader = dataloaders["train"] - + sampler = None + if dataloaders["sampler"] is not None: + # data['batch_size'] = len(sample['label']) + sampler = InfIterator(dataloaders["sampler"]) for sample in loader: data = { i: (sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)) for i in sample.keys() } - + # data['batch_size'] = len(sample['label']) + # if dataloaders['sampler'] is not None: + # # data['batch_size'] = len(sample['label']) + # data['sampler'] = dataloaders['sampler'] + data["sampler"] = sampler loss = self.model(data).mean() if self.apex: with self.amp.scale_loss(loss, self.optimizer) as scaled_loss: @@ -525,7 +556,7 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: c += 1 if self.verbose and self.verbose_bar and logging_level < logging.INFO: if self.verbose_inside and c % self.verbose_inside == 0: - val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) + val_loss, val_data, weights = self.test(dataloaders=dataloaders) if self.stop_by_metric: cond = -1 * self.metric(*val_data, weights) else: @@ -545,12 +576,12 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: return loss_log def test( - self, dataloader: DataLoader, stage: str = "val", snap: bool = False + self, dataloaders: DataLoader, stage: str = "val", snap: bool = False ) -> Tuple[List[float], Tuple[np.ndarray, np.ndarray]]: """Testing loop. Args: - dataloader: Torch dataloader. + dataloaders: Torch dataloader. stage: Train, val or test. snap: Use snapshots. @@ -558,6 +589,7 @@ def test( Loss, (Target, OOF). 
""" + ##################### loss_log = [] weights_log = [] self.model.eval() @@ -565,17 +597,21 @@ def test( target = [] logging_level = get_stdout_level() if logging_level < logging.INFO and self.verbose and self.verbose_bar: - loader = tqdm(dataloader, desc=stage, disable=False) + loader = tqdm(dataloaders[stage], desc=stage, disable=False) else: - loader = dataloader - + loader = dataloaders[stage] + sampler = None + if dataloaders["sampler"] is not None: + # data['batch_size'] = len(sample['label']) + sampler = InfIterator(dataloaders["sampler"]) with torch.no_grad(): for sample in loader: data = { - i: (sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)) + i: sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device) for i in sample.keys() } - + data["sampler"] = sampler + # NOTE, HERE WE CAN ADD TORCH.UNIQUE if snap: output = self.se.predict(data) loss = self.se.forward(data) if stage != "test" else None @@ -588,11 +624,11 @@ def test( loss_log.append(loss) - output = output.data.cpu().numpy() - target_data = data["label"].data.cpu().numpy() + output = output.data.cpu().numpy()[: len(sample["label"])] + target_data = data["label"].data.cpu().numpy()[: len(sample["label"])] weights = data.get("weight", None) if weights is not None: - weights = weights.data.cpu().numpy() + weights = weights.data.cpu().numpy()[: len(sample["label"])] pred.append(output) target.append(target_data) @@ -609,16 +645,16 @@ def test( np.array(weights_log), ) - def predict(self, dataloader: DataLoader, stage: str) -> np.ndarray: + def predict(self, dataloaders: DataLoader, stage: str) -> np.ndarray: """Predict model. Args: - dataloader: Torch dataloader. + dataloaders: Torch dataloader. stage: Train, val or test. Returns: Prediction. """ - loss, (target, pred), _ = self.test(stage=stage, snap=self.is_snap, dataloader=dataloader) + loss, (target, pred), _ = self.test(stage=stage, snap=self.is_snap, dataloaders=dataloaders) return pred diff --git a/lightautoml/text/utils.py b/lightautoml/text/utils.py index 18a8fe70..d1cc3d0d 100644 --- a/lightautoml/text/utils.py +++ b/lightautoml/text/utils.py @@ -23,6 +23,7 @@ "token_type_ids": "long", "text": "float", # embeddings "length": "long", + "fold": "long" } @@ -66,7 +67,7 @@ def is_shuffle(stage: str) -> bool: Bool value. """ - is_sh = {"train": True, "val": False, "test": False} + is_sh = {"train": True, "val": False, "test": False, "sampler": True} return is_sh[stage]