diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py
index 8b59aa6a9..d8de58951 100644
--- a/QEfficient/cloud/finetune.py
+++ b/QEfficient/cloud/finetune.py
@@ -288,11 +288,10 @@ def main(**kwargs) -> None:
         --model_name "meta-llama/Llama-3.2-1B" \\
         --lr 5e-4
     """
-    # TODO:Remove TrainConfig() and update_config() as all params are passed in kwargs by parser
     train_config = TrainConfig()
     update_config(train_config, **kwargs)
-    dataset_config = generate_dataset_config(train_config.dataset)
-    update_config(dataset_config, **kwargs)
+    custom_dataset_config_file = kwargs.pop("custom_dataset_config", None)
+    dataset_config = generate_dataset_config(train_config.dataset, custom_dataset_config_file)
 
     logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level)
 
diff --git a/QEfficient/finetune/configs/dataset_config.py b/QEfficient/finetune/configs/dataset_config.py
index 1f4fe094b..a895e727c 100644
--- a/QEfficient/finetune/configs/dataset_config.py
+++ b/QEfficient/finetune/configs/dataset_config.py
@@ -41,7 +41,5 @@ class imdb_dataset:
 @dataclass
 class custom_dataset:
     dataset: str = "custom_dataset"
-    file: str = "dataset/custom_dataset.py"
     train_split: str = "train"
     test_split: str = "validation"
-    data_path: str = ""
diff --git a/QEfficient/finetune/configs/sample_peft_config.json b/QEfficient/finetune/configs/sample_peft_config.json
new file mode 100644
index 000000000..c53c9c9dd
--- /dev/null
+++ b/QEfficient/finetune/configs/sample_peft_config.json
@@ -0,0 +1,17 @@
+{
+    "r": 32,
+    "lora_alpha": 64,
+    "target_modules": [
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "up_proj",
+        "down_proj",
+        "gate_proj"
+    ],
+    "bias": "none",
+    "task_type": "CAUSAL_LM",
+    "lora_dropout": 0.05,
+    "inference_mode": false
+}
\ No newline at end of file
diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py
index 4a1f500e3..ef76e83ed 100644
--- a/QEfficient/finetune/dataset/custom_dataset.py
+++ b/QEfficient/finetune/dataset/custom_dataset.py
@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import importlib
+import logging
 from pathlib import Path
 
 from QEfficient.finetune.utils.logging_utils import logger
@@ -26,18 +27,28 @@ def load_module_from_py_file(py_file: str) -> object:
 
 
 def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
-    else:
-        module_path, func_name = dataset_config.file, "get_custom_dataset"
+    if not hasattr(dataset_config, "preproc_file"):
+        logger.raise_error("Can not find preproc_file key in dataset_config file.", RuntimeError)
+
+    if ":" not in dataset_config.preproc_file:
+        logger.raise_error(
+            "The 'preproc_file' key in dataset_config file should follow the format: python_file_path:function_name",
+            RuntimeError,
+        )
+
+    module_path, func_name = dataset_config.preproc_file.split(":")
+    logger.log_rank_zero(
+        f"Using '{func_name}' function from {module_path} as preprocessing function in dataset preprocessing.",
+        logging.DEBUG,
+    )
 
     if not module_path.endswith(".py"):
-        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
+        logger.raise_error(f"Custom dataset preprocessing file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
         logger.raise_error(
-            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+            f"Custom dataset file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
         )
 
     module = load_module_from_py_file(module_path.as_posix())
@@ -45,24 +56,43 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non
         return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
     except AttributeError:
         logger.raise_error(
-            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).",
+            f"For custom dataset preprocessing, the method ({func_name}) is not "
+            f"present in the file ({module_path.as_posix()}).",
             AttributeError,
         )
 
 
 def get_data_collator(dataset_processer, dataset_config):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
+    if not hasattr(dataset_config, "collate_file"):
+        logger.log_rank_zero(
+            "Can not find collate_file key in dataset_config file. Using the default data collator function instead.",
+            logging.WARNING,
+        )
+        return None
+
+    if ":" not in dataset_config.collate_file:
+        logger.log_rank_zero(
+            "Can not find function name in 'collate_file' key in dataset_config "
+            "file. Using the default data collator function instead. If this is "
+            "not intended then change the format of the 'collate_file' key in "
+            "dataset_config file to follow the format: python_file_path:function_name",
+            logging.WARNING,
+        )
+        return None
     else:
-        module_path, func_name = dataset_config.file, "get_data_collator"
+        module_path, func_name = dataset_config.collate_file.split(":")
+        logger.log_rank_zero(
+            f"Using '{func_name}' function from {module_path} as collate_fn in dataset preprocessing.",
+            logging.DEBUG,
+        )
 
     if not module_path.endswith(".py"):
-        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
+        logger.raise_error(f"Custom dataset collate file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
         logger.raise_error(
-            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+            f"Custom dataset collate file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
         )
 
     module = load_module_from_py_file(module_path.as_posix())
@@ -70,7 +100,8 @@ def get_data_collator(dataset_processer, dataset_config):
         return getattr(module, func_name)(dataset_processer)
     except AttributeError:
         logger.log_rank_zero(
-            f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()})."
+            f"Can not find the function {func_name} in file "
+            f"({module_path.as_posix()}). Using the default data collator "
+            "function instead."
         )
-        logger.log_rank_zero("Using the default data_collator instead.")
         return None
diff --git a/QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json
new file mode 100644
index 000000000..2f6be9bc7
--- /dev/null
+++ b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json
@@ -0,0 +1,7 @@
+{
+    "train_split": "train",
+    "test_split": "test",
+    "test_split_ratio": 0.15,
+    "preproc_file": "./QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py:get_preprocessed_disc",
+    "disc_style": "sarcasm_more"
+}
\ No newline at end of file
diff --git a/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py
new file mode 100644
index 000000000..78db5674c
--- /dev/null
+++ b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py
@@ -0,0 +1,87 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+
+import datasets
+from transformers.data import DataCollatorForSeq2Seq
+
+
+def get_data_collator(tokenizer):
+    return DataCollatorForSeq2Seq(tokenizer)
+
+
+def get_preprocessed_disc(dataset_config, tokenizer, split, context_length=None):
+    dataset = datasets.load_dataset("hallisky/DiSC")
+
+    # Considering 'train' split as this dataset has only one split.
+    dataset = dataset["train"]
+
+    test_split_ratio = dataset_config.test_split_ratio
+    disc_style = dataset_config.disc_style
+
+    # Only collect the samples for a given style.
+    available_styles = set(dataset["category"])
+    if disc_style not in available_styles:
+        raise RuntimeError(f"For DiSC dataset the provided disc_style '{disc_style}' is not supported.")
+
+    dataset = dataset.filter(lambda example: example["category"] == disc_style)
+
+    # Shuffle the dataset before splitting
+    dataset = dataset.shuffle(seed=42)
+
+    # Split the data in train and test split.
+    total_samples = len(dataset)
+    test_size = int(total_samples * test_split_ratio)
+    train_size = total_samples - test_size
+
+    if split == "test":
+        indices = range(train_size, total_samples)
+    else:
+        indices = range(0, train_size)
+
+    dataset = dataset.select(indices)
+
+    if tokenizer.pad_token is None:
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+
+    # Below is the template of the DiSC dataset.
+    # ### Original:{original} \n ### Rewrite: {rewrite}
+    template = "### Original:{original} \n ### Rewrite: "
+
+    def apply_prompt_template(sample):
+        return {
+            "input": template.format(original=sample["original"]),
+            "label": sample["generation"],
+        }
+
+    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
+
+    def tokenize_add_label(sample):
+        input = tokenizer.encode(
+            tokenizer.bos_token + sample["input"],
+            add_special_tokens=False,
+            max_length=context_length,
+            pad_to_max_length=True,
+        )
+        label = tokenizer.encode(
+            sample["label"] + tokenizer.pad_token + tokenizer.eos_token,
+            add_special_tokens=False,
+            max_length=context_length,
+            pad_to_max_length=True,
+        )
+
+        sample = {
+            "input_ids": (input + label),
+            "attention_mask": [1] * (len(input) + len(label)),
+            "labels": [-100] * len(input) + label,
+        }
+
+        return sample
+
+    dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features))
+
+    return dataset
diff --git a/QEfficient/finetune/dataset/dataset_config.py b/QEfficient/finetune/dataset/dataset_config.py
index 2e477be77..b766e923c 100644
--- a/QEfficient/finetune/dataset/dataset_config.py
+++ b/QEfficient/finetune/dataset/dataset_config.py
@@ -5,7 +5,6 @@
 #
 # -----------------------------------------------------------------------------
 
-from functools import partial
 
 from QEfficient.finetune.dataset.alpaca_dataset import (
     InstructionDataset as get_alpaca_dataset,
@@ -23,7 +22,7 @@
 )
 
 DATASET_PREPROC = {
-    "alpaca_dataset": partial(get_alpaca_dataset),
+    "alpaca_dataset": get_alpaca_dataset,
     "grammar_dataset": get_grammar_dataset,
     "gsm8k_dataset": get_gsm8k_dataset,
     "custom_dataset": get_custom_dataset,
diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py
index 64f17fecb..0c8b3d827 100644
--- a/QEfficient/finetune/utils/config_utils.py
+++ b/QEfficient/finetune/utils/config_utils.py
@@ -8,13 +8,14 @@
 import inspect
 import json
 import os
+from collections import namedtuple
 from dataclasses import asdict
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import yaml
 from peft import LoraConfig as PeftLoraConfig
 
-import QEfficient.finetune.configs.dataset_config as datasets
+import QEfficient.finetune.configs.dataset_config as qeff_datasets
 from QEfficient.finetune.configs.peft_config import LoraConfig
 from QEfficient.finetune.configs.training import TrainConfig
 from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC
@@ -86,11 +87,14 @@ def generate_peft_config(train_config: TrainConfig, **kwargs) -> Any:
     return peft_config
 
 
-def generate_dataset_config(dataset_name: str) -> Any:
+def generate_dataset_config(dataset_name: str, custom_dataset_config: Optional[str] = None) -> Any:
     """Generate a dataset configuration based on the specified dataset.
 
     Args:
         dataset_name (str): Name of the dataset to be used for finetuning.
+        custom_dataset_config (str): Dataset config JSON file for a custom dataset.
+            This file contains dataset-specific arguments to be used in the dataset
+            preprocessing step.
 
     Returns:
         Any: A dataset configuration object.
@@ -101,7 +105,20 @@ def generate_dataset_config(dataset_name: str) -> Any:
     supported_datasets = DATASET_PREPROC.keys()
     assert dataset_name in supported_datasets, f"Given dataset '{dataset_name}' is not supported."
     # FIXME (Meet): Replace below logic by creating using auto registry of datasets.
-    dataset_config = {k: v for k, v in inspect.getmembers(datasets)}[dataset_name]()
+    dataset_config = {k: v for k, v in inspect.getmembers(qeff_datasets)}[dataset_name]()
+    if dataset_name == "custom_dataset":
+        if custom_dataset_config is None:
+            logger.raise_error(
+                "For 'custom_dataset', please provide dataset config file via 'custom_dataset_config' flag.",
+                RuntimeError,
+            )
+        custom_dataset_dict = asdict(dataset_config)
+        custom_dataset_dict_override = load_config_file(custom_dataset_config)
+        # Override existing and add new params to dataset_config.
+        custom_dataset_dict.update(custom_dataset_dict_override)
+
+        custom_dataset_class = namedtuple("custom_dataset", custom_dataset_dict.keys())
+        dataset_config = custom_dataset_class(**custom_dataset_dict)
 
     return dataset_config
 
diff --git a/QEfficient/finetune/utils/dataset_utils.py b/QEfficient/finetune/utils/dataset_utils.py
index 0fb325c8a..d3a7e7185 100644
--- a/QEfficient/finetune/utils/dataset_utils.py
+++ b/QEfficient/finetune/utils/dataset_utils.py
@@ -64,8 +64,9 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, split):
             kwargs["drop_last"] = False
     else:
         kwargs["batch_size"] = batch_size
-        kwargs["drop_last"] = False
-        kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer)
+        kwargs["drop_last"] = True
+        # todo: -100 should be changed to a variable. or tokenizer.pad_token_id
+        kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer, label_pad_token_id=-100)
     return kwargs
 
 
diff --git a/QEfficient/finetune/utils/parser.py b/QEfficient/finetune/utils/parser.py
index ad53ae35d..18dae8ec2 100644
--- a/QEfficient/finetune/utils/parser.py
+++ b/QEfficient/finetune/utils/parser.py
@@ -43,6 +43,13 @@ def get_finetune_parser():
         default=None,
         help="Name of the tokenizer,if not passed as an argument, it uses the value of model_name",
     )
+    parser.add_argument(
+        "--custom_dataset_config",
+        "--custom-dataset-config",
+        type=str,
+        default=None,
+        help="Path of custom dataset config json file to override the custom dataset params such as test_split_ratio, test_split etc.",
+    )
     parser.add_argument(
         "--run_validation",
         "--run-validation",
diff --git a/docs/source/finetune.md b/docs/source/finetune.md
index be8dfde00..311605709 100644
--- a/docs/source/finetune.md
+++ b/docs/source/finetune.md
@@ -75,38 +75,82 @@ tensorboard --logdir runs/ --bind_all
 
 1) Gradient accumulation: By default, gradient accumulation happens for 4 steps. To update this value, command line argument gradient_accumulation_steps has to be passed. (Example: '--gradient_accumulation_steps 8')
 2) Gradient Checkpointing: By default, gradient checkpointing is disabled. To enable it, command line argument gradient_accumulation_steps has to be passed.
 
-## Fine-Tuning on custom dataset
-To run fine tuning for any user specific dataset, prepare the dataset using the following steps:
-
-1. Create a directory named 'dataset' inside efficient-transformers.
-2. Inside this directory, create a file named 'custom_dataset.py'.
-3. Inside the newly created efficient-transformers/dataset/custom_dataset.py, define a function named 'get_custom_dataset'.
-4. get_custom_dataset() should have following 4 parameters: dataset_config, tokenizer, split, context_length.
-5. Inside get_custom_dataset(), user needs to apply prompt and tokenize the dataset accordingly. Please refer the below template on how to define get_custom_dataset().
-6. For examples, please refer python files present in [dataset](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset).
-7. In [dataset_config.py](https://github.com/quic/efficient-transformers/blob/main/QEfficient/finetune/configs/dataset_config.py), for custom_dataset class, pass the appropriate value for train_split and test_split. As an alternative, these values can be passed as command line arguments as well with the finetune command. For example "--train_split train".
-8. While running fine tuning, pass argument "-–dataset custom_dataset" to finetune on custom dataset.
-
-Template for get_custom_dataset() to be defined inside efficient-transformers/dataset/custom_dataset.py is as follows:
-
-```python
-def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
-
-    # load dataset
-    # based on split, retrieve only the specific portion of the dataset (train or eval) either here or at the last
-
-    def apply_prompt_template():
-        # transform the passed datapoint by applying the prompt on it
-
-    def tokenize():
-        # tokenize the passed datapoint
-
-    # define the prompt
-    # call apply_prompt_template() for each data point:
-    # dataset = dataset.map(apply_prompt_template ,)
-    # call tokenize() for each data point:
-    # dataset = dataset.map(tokenize, )
-
-    return dataset
-```
+### đŸ”§ Steps to Fine-Tune with a Custom Dataset
+
+1. **Launching Fine-Tuning with a Custom Dataset**
+   - Use the following command-line arguments to begin fine-tuning using a custom dataset:
+     ```bash
+     --dataset custom_dataset --custom_dataset_config data_config.json
+     ```
+   - The `--custom_dataset_config` argument is mandatory when `--dataset custom_dataset` is specified. The `data_config.json` file contains essential parameters used during dataset preprocessing.
+
+   __Example `data_config.json` File__
+   ```json
+   {
+       "train_split": "train",
+       "test_split": "test",
+       "test_split_ratio": 0.15,
+       "preproc_file": "sample_dataset_preproc.py:preprocessing_fn",
+       "collate_file": "sample_dataset_preproc.py:data_collate_fn",
+       "disc_style": "sarcasm_more"
+   }
+   ```
+
+2. **Specifying the Preprocessing Function**
+   - In `data_config.json`, include the mandatory `"preproc_file"` key to define the path to your preprocessing Python file and the function within it.
+   - Use the format `"filename.py:function_name"`. Both the file name and the function name are required.
+     _Example:_
+     ```json
+     "preproc_file": "sample_dataset_preproc.py:preprocessing_fn"
+     ```
+   - The preprocessing function must follow the structure below. The function parameters and the return type of the function should not be altered. The sample illustrates `apply_prompt_template` and `tokenize` as sub-functions, but you can define your own sub-functions as needed. For reference, check the example files in the [./QEfficient/finetune/dataset/](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset) directory.
+     ```python
+     def preprocessing_fn(dataset_config, tokenizer, split, context_length=None):
+         # Load the dataset or read from the disk
+         # ...
+
+         # Split the dataset into train and test splits if needed,
+         # and use the appropriate split based on the 'split' argument.
+         # ...
+
+         def apply_prompt_template(example):
+             # Apply prompt formatting to each datapoint (e.g., example)
+             # ...
+             return example  # Return the processed example
+
+         def tokenize(example):
+             # Tokenize the formatted datapoint (e.g., example)
+             # ...
+             return tokenizer(example["text"], truncation=True, max_length=context_length)  # Example tokenization
+
+         # Apply prompt template to preprocess it in accordance to the dataset and task.
+         dataset = dataset.map(apply_prompt_template, ...)
+
+         # Finally, tokenize the dataset
+         dataset = dataset.map(tokenize, batched=True, remove_columns=['text'])  # Example batched tokenization
+
+         # Each sample in the dataset should have keys acceptable by the HF
+         # model and the loss function.
+         # Typically, for CausalLM models used with 'generation' task_mode,
+         # the keys should be 'input_ids', 'attention_mask', and 'labels'.
+         return dataset
+     ```
+   - In the sample preprocessing function above, the `split` variable takes its value from `data_config.json`. For the training dataset, the value will be taken from the `"train_split"` key, and for the evaluation/test dataset, it will be taken from the `"test_split"` key.
+   - Additional arguments needed for the preprocessing function can be passed in `data_config.json` and will be available via the `dataset_config` variable within the function. For instance, in the sample config above, the `"test_split_ratio"` and `"disc_style"` keys can be used in the preprocessing function to define the test split ratio and the style of the dataset. These values are accessed through the `dataset_config` variable. Check out the sample preprocessing file at [./QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py).
+
+3. **Custom Collate Function for Batching**
+   - When using a batch size greater than 1, you may need to override the default collate behavior (how different samples are batched together) by including a `"collate_file"` key in `data_config.json`.
+   - Use the same `"file.py:function"` format. If omitted, the default Hugging Face `DataCollatorForSeq2Seq` is used, which pads sequences to the longest length in the batch.
+   - A custom collate function must follow the structure below. The function parameters and the return type of the function should not be altered:
+     ```python
+     def get_data_collator(tokenizer):
+         # Define and return a custom collate_fn here
+         # ...
+
+         # The returned collate_fn should take a list of samples and return a batch.
+         # Example:
+         # from transformers import DataCollatorForLanguageModeling
+         # return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+     ```
+---
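To make the documented contract concrete, the following is a minimal, self-contained sketch of a custom dataset module that follows the `preproc_file`/`collate_file` format introduced by this change. It is illustrative only: the module name (`sketch_custom_dataset.py`), the Hugging Face dataset (`yelp_review_full`), its `text` column, and the function names are assumptions, not part of the diff above.

```python
# sketch_custom_dataset.py -- illustrative sketch; dataset, column, and function names are assumptions.
import datasets
from transformers.data import DataCollatorForSeq2Seq


def get_data_collator(tokenizer):
    # Referenced from the dataset config as "sketch_custom_dataset.py:get_data_collator".
    # The returned collator pads every sequence in a batch to the batch's longest length.
    return DataCollatorForSeq2Seq(tokenizer)


def preprocessing_fn(dataset_config, tokenizer, split, context_length=None):
    # Referenced from the dataset config as "sketch_custom_dataset.py:preprocessing_fn".
    # 'split' carries the value of the config's "train_split" or "test_split" key;
    # any extra keys in the config JSON are available as attributes on 'dataset_config'.
    dataset = datasets.load_dataset("yelp_review_full", split=split)

    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})

    def tokenize(sample):
        # Produce the keys a CausalLM model expects: input_ids, attention_mask, labels.
        ids = tokenizer.encode(
            (tokenizer.bos_token or "") + sample["text"] + (tokenizer.eos_token or ""),
            add_special_tokens=False,
            max_length=context_length,
            truncation=context_length is not None,
        )
        return {
            "input_ids": ids,
            "attention_mask": [1] * len(ids),
            "labels": list(ids),
        }

    return dataset.map(tokenize, remove_columns=list(dataset.features))
```

A dataset config pointing at this file would set `"preproc_file": "sketch_custom_dataset.py:preprocessing_fn"` and, optionally, `"collate_file": "sketch_custom_dataset.py:get_data_collator"`, and would be passed to the finetune command with `--dataset custom_dataset --custom_dataset_config <config.json>`.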