From a8e6a4a72fc9985a80047f548294df2e3693b2a3 Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Fri, 27 Jun 2025 21:17:54 +0530 Subject: [PATCH 1/8] Updated handling of custom dataset in FT. Updated finetune.md readme accordingly. Signed-off-by: meetkuma --- QEfficient/cloud/finetune.py | 15 +-- QEfficient/finetune/configs/dataset_config.py | 2 - QEfficient/finetune/dataset/custom_dataset.py | 45 ++++++--- QEfficient/finetune/utils/config_utils.py | 20 +++- QEfficient/finetune/utils/helper.py | 11 +++ QEfficient/finetune/utils/parser.py | 14 +++ docs/source/finetune.md | 97 ++++++++++++------- 7 files changed, 144 insertions(+), 60 deletions(-) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index 8b59aa6a9..ac69809a5 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -21,6 +21,7 @@ from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer from QEfficient.finetune.configs.training import TrainConfig +from QEfficient.finetune.utils.helper import parse_unk_args from QEfficient.finetune.utils.config_utils import ( generate_dataset_config, generate_peft_config, @@ -133,7 +134,7 @@ def load_model_and_tokenizer( model = AutoModelForSequenceClassification.from_pretrained( pretrained_model_path, num_labels=dataset_config.num_labels, - attn_implementation="sdpa", + attn_implementation="eager", torch_dtype=torch.float16, ) @@ -151,7 +152,7 @@ def load_model_and_tokenizer( model = AutoModelForCausalLM.from_pretrained( pretrained_model_path, use_cache=False, - attn_implementation="sdpa", + attn_implementation="eager", torch_dtype=torch.float16, device_map=device_map, ) @@ -288,11 +289,10 @@ def main(**kwargs) -> None: --model_name "meta-llama/Llama-3.2-1B" \\ --lr 5e-4 """ - # TODO:Remove TrainConfig() and update_config() as all params are passed in kwargs by parser train_config = TrainConfig() update_config(train_config, **kwargs) - dataset_config = 
generate_dataset_config(train_config.dataset) - update_config(dataset_config, **kwargs) + dataset_config_file = kwargs.pop("dataset_config", None) + dataset_config = generate_dataset_config(train_config.dataset, dataset_config_file) logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level) @@ -341,6 +341,7 @@ def main(**kwargs) -> None: if __name__ == "__main__": parser = get_finetune_parser() - args = parser.parse_args() + args, unk_args = parser.parse_known_args() + unk_args_dict = parse_unk_args(unk_args) args_dict = vars(args) - main(**args_dict) + main(**args_dict, **unk_args_dict) diff --git a/QEfficient/finetune/configs/dataset_config.py b/QEfficient/finetune/configs/dataset_config.py index 1f4fe094b..a895e727c 100644 --- a/QEfficient/finetune/configs/dataset_config.py +++ b/QEfficient/finetune/configs/dataset_config.py @@ -41,7 +41,5 @@ class imdb_dataset: @dataclass class custom_dataset: dataset: str = "custom_dataset" - file: str = "dataset/custom_dataset.py" train_split: str = "train" test_split: str = "validation" - data_path: str = "" diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index 4a1f500e3..f924a367f 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -26,18 +26,26 @@ def load_module_from_py_file(py_file: str) -> object: def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None): - if ":" in dataset_config.file: - module_path, func_name = dataset_config.file.split(":") + if not hasattr(dataset_config, "preproc_file"): + raise RuntimeError("Can not find preproc_file key in dataset_config file.") + + if ":" in dataset_config.preproc_file: + module_path, func_name = dataset_config.preproc_file.split(":") else: - module_path, func_name = dataset_config.file, "get_custom_dataset" + module_path, func_name = dataset_config.preproc_file, "get_custom_dataset" + print( + 
f"Using '{func_name}' function from " + f"{dataset_config.preproc_file} as preprocessing function in " + "dataset preprocessing." + ) if not module_path.endswith(".py"): - logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError) + logger.raise_error(f"Custom dataset preprocessing file {module_path} is not a .py file.", ValueError) module_path = Path(module_path) if not module_path.is_file(): logger.raise_error( - f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError + f"Custom dataset file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError ) module = load_module_from_py_file(module_path.as_posix()) @@ -45,24 +53,34 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non return getattr(module, func_name)(dataset_config, tokenizer, split, context_length) except AttributeError: logger.raise_error( - f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).", + f"For custom dataset preprocessing, the method ({func_name}) is not " + f"present in the file ({module_path.as_posix()}).", AttributeError, ) def get_data_collator(dataset_processer, dataset_config): - if ":" in dataset_config.file: - module_path, func_name = dataset_config.file.split(":") + if not hasattr(dataset_config, "collate_file"): + print( + f"Can not find collate_file key in dataset_config file. Using the default data collator function instead." + ) + return None + + if ":" in dataset_config.collate_file: + module_path, func_name = dataset_config.collate_file.split(":") else: - module_path, func_name = dataset_config.file, "get_data_collator" + module_path, func_name = dataset_config.collate_file, "get_data_collator" + print( + f"Using '{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing." 
+ ) if not module_path.endswith(".py"): - logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError) + logger.raise_error(f"Custom dataset collate file {module_path} is not a .py file.", ValueError) module_path = Path(module_path) if not module_path.is_file(): logger.raise_error( - f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError + f"Custom dataset collate file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError ) module = load_module_from_py_file(module_path.as_posix()) @@ -70,7 +88,8 @@ def get_data_collator(dataset_processer, dataset_config): return getattr(module, func_name)(dataset_processer) except AttributeError: logger.log_rank_zero( - f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()})." + f"Can not find the function {func_name} in file " + f"({module_path.as_posix()}). Using the default data collator " + "function instead." ) - logger.log_rank_zero("Using the default data_collator instead.") return None diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py index 64f17fecb..158c9d43b 100644 --- a/QEfficient/finetune/utils/config_utils.py +++ b/QEfficient/finetune/utils/config_utils.py @@ -9,12 +9,13 @@ import json import os from dataclasses import asdict -from typing import Any, Dict +from typing import Any, Dict, Optional +from collections import namedtuple import yaml from peft import LoraConfig as PeftLoraConfig -import QEfficient.finetune.configs.dataset_config as datasets +import QEfficient.finetune.configs.dataset_config as qeff_datasets from QEfficient.finetune.configs.peft_config import LoraConfig from QEfficient.finetune.configs.training import TrainConfig from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC @@ -86,11 +87,14 @@ def generate_peft_config(train_config: TrainConfig, **kwargs) -> Any: return peft_config -def 
generate_dataset_config(dataset_name: str) -> Any: +def generate_dataset_config(dataset_name: str, custom_dataset_config: Optional[str] = None) -> Any: """Generate a dataset configuration based on the specified dataset. Args: dataset_name (str): Name of the dataset to be used for finetuning. + custom_dataset_config (str): Dataset config json file for custom dataset. + This file contains dataset specific arguments to be used in dataset + preprocessing step. Returns: Any: A dataset configuration object. @@ -101,7 +105,15 @@ def generate_dataset_config(dataset_name: str) -> Any: supported_datasets = DATASET_PREPROC.keys() assert dataset_name in supported_datasets, f"Given dataset '{dataset_name}' is not supported." # FIXME (Meet): Replace below logic by creating using auto registry of datasets. - dataset_config = {k: v for k, v in inspect.getmembers(datasets)}[dataset_name]() + dataset_config = {k: v for k, v in inspect.getmembers(qeff_datasets)}[dataset_name]() + if dataset_name == "custom_dataset": + custom_dataset_dict = asdict(dataset_config) + custom_dataset_dict_override = load_config_file(custom_dataset_config) + # Override existing and add new params to dataset_config. 
+ custom_dataset_dict.update(custom_dataset_dict_override) + + custom_dataset_class = namedtuple("custom_dataset", custom_dataset_dict.keys()) + dataset_config = custom_dataset_class(**custom_dataset_dict) return dataset_config diff --git a/QEfficient/finetune/utils/helper.py b/QEfficient/finetune/utils/helper.py index fd584d8c0..f92f022ee 100644 --- a/QEfficient/finetune/utils/helper.py +++ b/QEfficient/finetune/utils/helper.py @@ -227,3 +227,14 @@ def save_to_json( } with open(output_filename, "w") as f: json.dump(metrics_data, f) +TASK_TYPE = ["generation", "seq_classification"] +PEFT_METHOD = ["lora"] +DEVICE = ["qaic", "cpu", "cuda"] +BATCHING_STRATEGY = ["padding", "packing"] + + +def parse_unk_args(unk_args_str): + if len(unk_args_str) % 2 != 0: + raise RuntimeError("Unknown arguments must be in pairs") + unk_args_dict = {unk_args_str[i].replace("--", ""): unk_args_str[i + 1] for i in range(0, len(unk_args_str), 2)} + return unk_args_dict diff --git a/QEfficient/finetune/utils/parser.py b/QEfficient/finetune/utils/parser.py index ad53ae35d..ec8fb70d3 100644 --- a/QEfficient/finetune/utils/parser.py +++ b/QEfficient/finetune/utils/parser.py @@ -43,6 +43,20 @@ def get_finetune_parser(): default=None, help="Name of the tokenizer,if not passed as an argument, it uses the value of model_name", ) + parser.add_argument( + "--peft_config_file", + "--peft-config-file", + type=str, + default=None, + help="Path of PEFT config json file to override the PEFT config params such as lora_r, lora_alpha etc.", + ) + parser.add_argument( + "--custom_dataset_config", + "--custom-dataset-config", + type=str, + default=None, + help="Path of custom dataset config json file to override the custom dataset params such as test_split_ratio, test_split etc.", + ) parser.add_argument( "--run_validation", "--run-validation", diff --git a/docs/source/finetune.md b/docs/source/finetune.md index be8dfde00..8bb4d58fa 100644 --- a/docs/source/finetune.md +++ b/docs/source/finetune.md @@ -75,38 
+75,67 @@ tensorboard --logdir runs/ --bind_all 1) Gradient accumulation: By default, gradient accumulation happens for 4 steps. To update this value, command line argument gradient_accumulation_steps has to be passed. (Example: '--gradient_accumulation_steps 8') 2) Gradient Checkpointing: By default, gradient checkpointing is disabled. To enable it, command line argument gradient_accumulation_steps has to be passed. -## Fine-Tuning on custom dataset -To run fine tuning for any user specific dataset, prepare the dataset using the following steps: - -1. Create a directory named 'dataset' inside efficient-transformers. -2. Inside this directory, create a file named 'custom_dataset.py'. -3. Inside the newly created efficient-transformers/dataset/custom_dataset.py, define a function named 'get_custom_dataset'. -4. get_custom_dataset() should have following 4 parameters: dataset_config, tokenizer, split, context_length. -5. Inside get_custom_dataset(), user needs to apply prompt and tokenize the dataset accordingly. Please refer the below template on how to define get_custom_dataset(). -6. For examples, please refer python files present in [dataset](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset). -7. In [dataset_config.py](https://github.com/quic/efficient-transformers/blob/main/QEfficient/finetune/configs/dataset_config.py), for custom_dataset class, pass the appropriate value for train_split and test_split. As an alternative, these values can be passed as command line arguments as well with the finetune command. For example "--train_split train". -8. While running fine tuning, pass argument "-–dataset custom_dataset" to finetune on custom dataset. 
- -Template for get_custom_dataset() to be defined inside efficient-transformers/dataset/custom_dataset.py is as follows: - -```python -def get_custom_dataset(dataset_config, tokenizer, split, context_length=None): - - # load dataset - # based on split, retrieve only the specific portion of the dataset (train or eval) either here or at the last - - def apply_prompt_template(): - # transform the passed datapoint by applying the prompt on it - - def tokenize(): - # tokenize the passed datapoint - - # define the prompt - # call apply_prompt_template() for each data point: - # dataset = dataset.map(apply_prompt_template ,) - # call tokenize() for each data point: - # dataset = dataset.map(tokenize, ) - - return dataset -``` +### 🔧 Steps to Fine-Tune with a Custom Dataset + +1. **Launching Fine-Tuning with a Custom Dataset** + Use the following command-line arguments to begin fine-tuning: + ``` + --dataset custom_dataset --dataset_config data_config.json + ``` + The `data_config.json` file contains essential parameters used during dataset preprocessing. + +2. **Specifying the Preprocessing Function** + - In `data_config.json`, include a `"preproc_file"` key to define the path to your preprocessing Python file. + - To specify a custom function within that file, use the format `"filename.py:function_name"`. + _Example:_ + ```json + "preproc_file": "disc_preproc.py:get_preprocessed_disc" + ``` + - Your preprocessing function must follow this structure: + ```python + def get_custom_dataset(dataset_config, tokenizer, split, context_length=None): + def apply_prompt_template(): + # Apply prompt formatting to each datapoint + + def tokenize(): + # Tokenize the formatted datapoint + + # Apply functions to dataset using map + dataset = dataset.map(apply_prompt_template, ...) + dataset = dataset.map(tokenize, ...) + + return dataset + ``` + +3. 
**Custom Collate Function for Batching** + - When using a batch size greater than 1, you may override the default collate behavior by including a `"collate_file"` key in `data_config.json`. + - Use the same `"file.py:function"` format. If omitted, the default Hugging Face `DataCollatorForSeq2Seq` is used, which pads sequences to the longest length in the batch. + - A custom collate function must have the following signature: + ```python + def get_data_collator(tokenizer): + # Define and return a custom collate_fn here + ``` + +4. **Passing Additional Configuration Parameters** + You can add custom arguments in `data_config.json`, which will be accessible via the `dataset_config` argument inside your `get_custom_dataset()` function. + +5. **Example `data_config.json` File** + ```json + { + "train_split": "train", + "test_split": "test", + "test_split_ratio": 0.15, + "preproc_file": "disc_preprocd.py:get_preprocessed_disc", + "collate_file": "disc_preprocd.py:get_collate_fn_disc", + "disc_style": "sarcasm_more" + } + ``` + +6. **Implementing Custom Preprocessing Logic** + Within your dataset loader function, define `apply_prompt_template()` to manipulate raw data into desired prompt format, and `tokenize()` to convert it into token IDs using the tokenizer. + +7. **Reference for Dataset Utilities** + You can refer to existing implementations in the [dataset directory of this repository](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset). + +--- From 8d71ae221661314fbc7ccebc4a18bb8f2b85e2e7 Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Mon, 21 Jul 2025 16:30:20 +0530 Subject: [PATCH 2/8] Minor changes to data collator call to explicitly pass -100 for pad token. 
Signed-off-by: meetkuma --- QEfficient/finetune/utils/dataset_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/QEfficient/finetune/utils/dataset_utils.py b/QEfficient/finetune/utils/dataset_utils.py index 0fb325c8a..d3a7e7185 100644 --- a/QEfficient/finetune/utils/dataset_utils.py +++ b/QEfficient/finetune/utils/dataset_utils.py @@ -64,8 +64,9 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, split): kwargs["drop_last"] = False else: kwargs["batch_size"] = batch_size - kwargs["drop_last"] = False - kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer) + kwargs["drop_last"] = True + # todo: -100 should be changed to a variable. or tokenizer.pad_token_id + kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer, label_pad_token_id=-100) return kwargs From 2d7a88c4ca1c99bed3ed1de2f768041e16afa98a Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Mon, 21 Jul 2025 16:59:08 +0530 Subject: [PATCH 3/8] Removed redundant code changes based on recent merged PRs. 
Signed-off-by: meetkuma --- QEfficient/cloud/finetune.py | 6 ++---- QEfficient/finetune/dataset/custom_dataset.py | 18 +++++++++++------- QEfficient/finetune/utils/config_utils.py | 2 +- QEfficient/finetune/utils/helper.py | 11 ----------- QEfficient/finetune/utils/parser.py | 7 ------- 5 files changed, 14 insertions(+), 30 deletions(-) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index ac69809a5..6a0d81d44 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -21,7 +21,6 @@ from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer from QEfficient.finetune.configs.training import TrainConfig -from QEfficient.finetune.utils.helper import parse_unk_args from QEfficient.finetune.utils.config_utils import ( generate_dataset_config, generate_peft_config, @@ -341,7 +340,6 @@ def main(**kwargs) -> None: if __name__ == "__main__": parser = get_finetune_parser() - args, unk_args = parser.parse_known_args() - unk_args_dict = parse_unk_args(unk_args) + args = parser.parse_args() args_dict = vars(args) - main(**args_dict, **unk_args_dict) + main(**args_dict) diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index f924a367f..a18447f29 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- import importlib +import logging from pathlib import Path from QEfficient.finetune.utils.logging_utils import logger @@ -27,16 +28,17 @@ def load_module_from_py_file(py_file: str) -> object: def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None): if not hasattr(dataset_config, "preproc_file"): - raise RuntimeError("Can not find preproc_file key in dataset_config file.") + logger.raise_error("Can not find preproc_file key in dataset_config file.", 
RuntimeError) if ":" in dataset_config.preproc_file: module_path, func_name = dataset_config.preproc_file.split(":") else: module_path, func_name = dataset_config.preproc_file, "get_custom_dataset" - print( + logger.log_rank_zero( f"Using '{func_name}' function from " f"{dataset_config.preproc_file} as preprocessing function in " - "dataset preprocessing." + "dataset preprocessing.", + logging.WARNING, ) if not module_path.endswith(".py"): @@ -61,8 +63,9 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non def get_data_collator(dataset_processer, dataset_config): if not hasattr(dataset_config, "collate_file"): - print( - f"Can not find collate_file key in dataset_config file. Using the default data collator function instead." + logger.log_rank_zero( + "Can not find collate_file key in dataset_config file. Using the default data collator function instead.", + logging.WARNING, ) return None @@ -70,8 +73,9 @@ def get_data_collator(dataset_processer, dataset_config): module_path, func_name = dataset_config.collate_file.split(":") else: module_path, func_name = dataset_config.collate_file, "get_data_collator" - print( - f"Using '{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing." 
+ logger.log_rank_zero( + f"Using '{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing.", + logging.WARNING, ) if not module_path.endswith(".py"): diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py index 158c9d43b..c0c4c4386 100644 --- a/QEfficient/finetune/utils/config_utils.py +++ b/QEfficient/finetune/utils/config_utils.py @@ -8,9 +8,9 @@ import inspect import json import os +from collections import namedtuple from dataclasses import asdict from typing import Any, Dict, Optional -from collections import namedtuple import yaml from peft import LoraConfig as PeftLoraConfig diff --git a/QEfficient/finetune/utils/helper.py b/QEfficient/finetune/utils/helper.py index f92f022ee..fd584d8c0 100644 --- a/QEfficient/finetune/utils/helper.py +++ b/QEfficient/finetune/utils/helper.py @@ -227,14 +227,3 @@ def save_to_json( } with open(output_filename, "w") as f: json.dump(metrics_data, f) -TASK_TYPE = ["generation", "seq_classification"] -PEFT_METHOD = ["lora"] -DEVICE = ["qaic", "cpu", "cuda"] -BATCHING_STRATEGY = ["padding", "packing"] - - -def parse_unk_args(unk_args_str): - if len(unk_args_str) % 2 != 0: - raise RuntimeError("Unknown arguments must be in pairs") - unk_args_dict = {unk_args_str[i].replace("--", ""): unk_args_str[i + 1] for i in range(0, len(unk_args_str), 2)} - return unk_args_dict diff --git a/QEfficient/finetune/utils/parser.py b/QEfficient/finetune/utils/parser.py index ec8fb70d3..18dae8ec2 100644 --- a/QEfficient/finetune/utils/parser.py +++ b/QEfficient/finetune/utils/parser.py @@ -43,13 +43,6 @@ def get_finetune_parser(): default=None, help="Name of the tokenizer,if not passed as an argument, it uses the value of model_name", ) - parser.add_argument( - "--peft_config_file", - "--peft-config-file", - type=str, - default=None, - help="Path of PEFT config json file to override the PEFT config params such as lora_r, lora_alpha etc.", - ) parser.add_argument( 
"--custom_dataset_config", "--custom-dataset-config", From c1f6fd0b7affd07e4ae9b014339762fbcc5e6eea Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Mon, 21 Jul 2025 20:35:06 +0530 Subject: [PATCH 4/8] Added a sample custom dataset config, custom dataset preprocessing python code and peft config file. Signed-off-by: meetkuma --- QEfficient/cloud/finetune.py | 4 +- .../finetune/configs/sample_peft_config.json | 17 ++++ .../dataset/custom_dataset/disc_preproc.py | 87 +++++++++++++++++++ .../custom_dataset/sample_dataset_config.json | 7 ++ QEfficient/finetune/utils/config_utils.py | 5 ++ 5 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 QEfficient/finetune/configs/sample_peft_config.json create mode 100644 QEfficient/finetune/dataset/custom_dataset/disc_preproc.py create mode 100644 QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index 6a0d81d44..941024cb5 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -290,8 +290,8 @@ def main(**kwargs) -> None: """ train_config = TrainConfig() update_config(train_config, **kwargs) - dataset_config_file = kwargs.pop("dataset_config", None) - dataset_config = generate_dataset_config(train_config.dataset, dataset_config_file) + custom_dataset_config_file = kwargs.pop("custom_dataset_config", None) + dataset_config = generate_dataset_config(train_config.dataset, custom_dataset_config_file) logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level) diff --git a/QEfficient/finetune/configs/sample_peft_config.json b/QEfficient/finetune/configs/sample_peft_config.json new file mode 100644 index 000000000..c53c9c9dd --- /dev/null +++ b/QEfficient/finetune/configs/sample_peft_config.json @@ -0,0 +1,17 @@ +{ + "r": 32, + "lora_alpha": 64, + "target_modules": [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "up_proj", + "down_proj", + "gate_proj" + ], + 
"bias": "none", + "task_type": "CAUSAL_LM", + "lora_dropout": 0.05, + "inference_mode": false +} \ No newline at end of file diff --git a/QEfficient/finetune/dataset/custom_dataset/disc_preproc.py b/QEfficient/finetune/dataset/custom_dataset/disc_preproc.py new file mode 100644 index 000000000..d06a40b94 --- /dev/null +++ b/QEfficient/finetune/dataset/custom_dataset/disc_preproc.py @@ -0,0 +1,87 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + + +import datasets +from transformers.data import DataCollatorForSeq2Seq + + +def get_data_collator(tokenizer): + return DataCollatorForSeq2Seq(tokenizer) + + +def get_preprocessed_disc(dataset_config, tokenizer, split, context_length=None): + dataset = datasets.load_dataset("hallisky/DiSC") + + # Considering 'train' split as this dataset has only one split. + dataset = dataset["train"] + + test_split_ratio = dataset_config.test_split_ratio + disc_style = dataset_config.disc_style + + # Only collect the samples for a given style. + available_styles = set(dataset["category"]) + if disc_style not in available_styles: + raise RuntimeError(f"For DiSC dataset the provided disc_stype '{disc_style}' is not supported.") + + dataset = dataset.filter(lambda example: example["category"] == disc_style) + + # Shuffle the dataset before splitting + dataset = dataset.shuffle(seed=42) + + # Split the data in train and test split. 
+ total_samples = len(dataset) + test_size = int(total_samples * test_split_ratio) + train_size = total_samples - test_size + + if split == "test": + indices = range(train_size, total_samples) + else: + indices = range(0, train_size) + + dataset = dataset.select(indices) + + if tokenizer.pad_token is None: + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + # Below is the template of the DiSC dataset. + # ### Original:{original} \n ### Rewrite: {rewrite} + template = "### Original:{original} \n ### Rewrite: " + + def apply_prompt_template(sample): + return { + "input": template.format(original=sample["original"]), + "label": sample["generation"], + } + + dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features)) + + def tokenize_add_label(sample): + input = tokenizer.encode( + tokenizer.bos_token + sample["input"], + add_special_tokens=False, + max_length=context_length, + pad_to_max_length=True, + ) + label = tokenizer.encode( + sample["label"] + tokenizer.pad_token + tokenizer.eos_token, + add_special_tokens=False, + max_length=context_length, + pad_to_max_length=True, + ) + + sample = { + "input_ids": (input + label), + "attention_mask": [1] * (len(input) + len(label)), + "labels": [-100] * len(input) + label, + } + + return sample + + dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features)) + + return dataset diff --git a/QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json new file mode 100644 index 000000000..2f6be9bc7 --- /dev/null +++ b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json @@ -0,0 +1,7 @@ +{ + "train_split": "train", + "test_split": "test", + "test_split_ratio": 0.15, + "preproc_file": "./QEfficient/finetune/dataset/custom_dataset/disc_preproc.py:get_preprocessed_disc", + "disc_style": "sarcasm_more" +} \ No newline at end of file diff --git 
a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py index c0c4c4386..0c8b3d827 100644 --- a/QEfficient/finetune/utils/config_utils.py +++ b/QEfficient/finetune/utils/config_utils.py @@ -107,6 +107,11 @@ def generate_dataset_config(dataset_name: str, custom_dataset_config: Optional[s # FIXME (Meet): Replace below logic by creating using auto registry of datasets. dataset_config = {k: v for k, v in inspect.getmembers(qeff_datasets)}[dataset_name]() if dataset_name == "custom_dataset": + if custom_dataset_config is None: + logger.raise_error( + "For 'custom_dataset', please provide dataset config file via 'custom_dataset_config' flag.", + RuntimeError, + ) custom_dataset_dict = asdict(dataset_config) custom_dataset_dict_override = load_config_file(custom_dataset_config) # Override existing and add new params to dataset_config. From ef4740e92218b5ba2bf3384e30f5b8df36361e6c Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Wed, 23 Jul 2025 16:06:26 +0530 Subject: [PATCH 5/8] Changed attention back to sdpa from eager. 
Signed-off-by: meetkuma --- QEfficient/cloud/finetune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index 941024cb5..d8de58951 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -133,7 +133,7 @@ def load_model_and_tokenizer( model = AutoModelForSequenceClassification.from_pretrained( pretrained_model_path, num_labels=dataset_config.num_labels, - attn_implementation="eager", + attn_implementation="sdpa", torch_dtype=torch.float16, ) @@ -151,7 +151,7 @@ def load_model_and_tokenizer( model = AutoModelForCausalLM.from_pretrained( pretrained_model_path, use_cache=False, - attn_implementation="eager", + attn_implementation="sdpa", torch_dtype=torch.float16, device_map=device_map, ) From 172625759f4c70cb35964c2be6694e0fb11df838 Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Mon, 4 Aug 2025 14:48:10 +0530 Subject: [PATCH 6/8] Updated documentation based on review comments. Signed-off-by: meetkuma --- QEfficient/finetune/dataset/custom_dataset.py | 4 +- ...c_preproc.py => sample_dataset_preproc.py} | 2 +- docs/source/finetune.md | 135 ++++++++++-------- 3 files changed, 79 insertions(+), 62 deletions(-) rename QEfficient/finetune/dataset/custom_dataset/{disc_preproc.py => sample_dataset_preproc.py} (98%) diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index a18447f29..6a8af0a23 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -33,7 +33,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non if ":" in dataset_config.preproc_file: module_path, func_name = dataset_config.preproc_file.split(":") else: - module_path, func_name = dataset_config.preproc_file, "get_custom_dataset" + module_path, func_name = dataset_config.preproc_file, "preproc_fn" logger.log_rank_zero( f"Using '{func_name}' function from " 
f"{dataset_config.preproc_file} as preprocessing function in " @@ -72,7 +72,7 @@ def get_data_collator(dataset_processer, dataset_config): if ":" in dataset_config.collate_file: module_path, func_name = dataset_config.collate_file.split(":") else: - module_path, func_name = dataset_config.collate_file, "get_data_collator" + module_path, func_name = dataset_config.collate_file, "data_collator_fn" logger.log_rank_zero( f"Using '{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing.", logging.WARNING, diff --git a/QEfficient/finetune/dataset/custom_dataset/disc_preproc.py b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py similarity index 98% rename from QEfficient/finetune/dataset/custom_dataset/disc_preproc.py rename to QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py index d06a40b94..78db5674c 100644 --- a/QEfficient/finetune/dataset/custom_dataset/disc_preproc.py +++ b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py @@ -26,7 +26,7 @@ def get_preprocessed_disc(dataset_config, tokenizer, split, context_length=None) # Only collect the samples for a given style. available_styles = set(dataset["category"]) if disc_style not in available_styles: - raise RuntimeError(f"For DiSC dataset the provided disc_stype '{disc_style}' is not supported.") + raise RuntimeError(f"For DiSC dataset the provided disc_style '{disc_style}' is not supported.") dataset = dataset.filter(lambda example: example["category"] == disc_style) diff --git a/docs/source/finetune.md b/docs/source/finetune.md index 8bb4d58fa..f8519d8c3 100644 --- a/docs/source/finetune.md +++ b/docs/source/finetune.md @@ -78,64 +78,81 @@ tensorboard --logdir runs/ --bind_all ### 🔧 Steps to Fine-Tune with a Custom Dataset -1. 
**Launching Fine-Tuning with a Custom Dataset** - Use the following command-line arguments to begin fine-tuning: - ``` - --dataset custom_dataset --dataset_config data_config.json - ``` - The `data_config.json` file contains essential parameters used during dataset preprocessing. - -2. **Specifying the Preprocessing Function** - - In `data_config.json`, include a `"preproc_file"` key to define the path to your preprocessing Python file. - - To specify a custom function within that file, use the format `"filename.py:function_name"`. - _Example:_ - ```json - "preproc_file": "disc_preproc.py:get_preprocessed_disc" - ``` - - Your preprocessing function must follow this structure: - ```python - def get_custom_dataset(dataset_config, tokenizer, split, context_length=None): - def apply_prompt_template(): - # Apply prompt formatting to each datapoint - - def tokenize(): - # Tokenize the formatted datapoint - - # Apply functions to dataset using map - dataset = dataset.map(apply_prompt_template, ...) - dataset = dataset.map(tokenize, ...) +1. **Launching Fine-Tuning with a Custom Dataset** + - Use the following command-line arguments to begin fine-tuning using a custom dataset: + ```bash + --dataset custom_dataset --dataset_config data_config.json + ``` + - The `--dataset_config` argument is mandatory when `--dataset custom_dataset` is specified. The `data_config.json` file contains essential parameters used during dataset preprocessing. + + __Example `data_config.json` File__ + ```json + { + "train_split": "train", + "test_split": "test", + "test_split_ratio": 0.15, + "preproc_file": "sample_dataset_preproc.py:preprocessing_fn", + "collate_file": "sample_dataset_preproc.py:data_collate_fn", + "disc_style": "sarcasm_more" + } + ``` + +2. **Specifying the Preprocessing Function** + - In `data_config.json`, include a `"preproc_file"` key to define the path to your preprocessing Python file and the function within it. + - Use the format `"filename.py:function_name"`. 
In the absence of `function_name`, the `preproc_fn` function will be used as default preprocessing function from the `filename.py`. + _Example:_ + ```json + "preproc_file": "sample_dataset_preproc.py:preprocessing_fn" + ``` + - The preprocessing function must follow the structure below. The signature of the function should not be altered. The sample illustrates `apply_prompt_template` and `tokenize` as sub-functions, but we can define our own sub-functions as needed. For reference, check the example files in the [./QEfficient/finetune/dataset/](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset) directory. + ```python + def preprocessing_fn(dataset_config, tokenizer, split, context_length=None): + # Load the dataset or read from the disk + # ... + + # Split the dataset into train and test splits if needed, + # and use the appropriate split based on the 'split' argument. + # ... + + def apply_prompt_template(example): + # Apply prompt formatting to each datapoint (e.g., example) + # ... + return example # Return the processed example + + def tokenize(example): + # Tokenize the formatted datapoint (e.g., example) + # ... + return tokenizer(example["text"], truncation=True, max_length=context_length) # Example tokenization + + # Apply prompt template to preprocess it in accordance with the dataset and task. + dataset = dataset.map(apply_prompt_template, ...) + + # Finally, tokenize the dataset + dataset = dataset.map(tokenize, batched=True, remove_columns=['text']) # Example batched tokenization + + # Each sample in the dataset should have keys accepted by the HF + # model and the loss function. + # Typically, for CausalLM models used with 'generation' task_mode, + # the keys should be 'input_ids', 'attention_mask', and 'labels'. + return dataset + ``` + - In the sample preprocessing function above, the `split` variable takes its value from `data_config.json`.
For the training dataset, the value will be taken from the `"train_split"` key, and for the evaluation/test dataset, it will be taken from the `"test_split"` key. + - Additional arguments needed for the preprocessing function can be passed in `data_config.json` and will be available via the `dataset_config` variable within the function. For instance, in the sample config above, `"test_split_ratio"` and `"disc_style"` keys can be used in the preprocessing function to define the test split ratio and style of the dataset. These values are accessed through the `dataset_config` variable. Check out the sample preprocessing file at [./QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py). + + + +3. **Custom Collate Function for Batching** + - When using a batch size greater than 1, we may need to override the default collate (batching different samples together in a batch) behavior by including a `"collate_file"` key in `data_config.json`. + - Use the same `"file.py:function"` format. If omitted, the default Hugging Face `DataCollatorForSeq2Seq` is typically used, which pads sequences to the longest length in the batch. + - A custom collate function must have the following signature. The signature of the function should not be altered: + ```python + def get_data_collator(tokenizer): + # Define and return a custom collate_fn here + # ... - return dataset - ``` - -3. **Custom Collate Function for Batching** - - When using a batch size greater than 1, you may override the default collate behavior by including a `"collate_file"` key in `data_config.json`. - - Use the same `"file.py:function"` format. If omitted, the default Hugging Face `DataCollatorForSeq2Seq` is used, which pads sequences to the longest length in the batch. 
- - A custom collate function must have the following signature: - ```python - def get_data_collator(tokenizer): - # Define and return a custom collate_fn here - ``` - -4. **Passing Additional Configuration Parameters** - You can add custom arguments in `data_config.json`, which will be accessible via the `dataset_config` argument inside your `get_custom_dataset()` function. - -5. **Example `data_config.json` File** - ```json - { - "train_split": "train", - "test_split": "test", - "test_split_ratio": 0.15, - "preproc_file": "disc_preprocd.py:get_preprocessed_disc", - "collate_file": "disc_preprocd.py:get_collate_fn_disc", - "disc_style": "sarcasm_more" - } - ``` - -6. **Implementing Custom Preprocessing Logic** - Within your dataset loader function, define `apply_prompt_template()` to manipulate raw data into desired prompt format, and `tokenize()` to convert it into token IDs using the tokenizer. - -7. **Reference for Dataset Utilities** - You can refer to existing implementations in the [dataset directory of this repository](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset). - + # The returned collate_fn should take a list of samples and return a batch. + # Example: + # from transformers import DataCollatorForLanguageModeling + # return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + ``` --- From f42e5b8120ff7250dd6d711dc78cd9f70bbdd9fa Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Mon, 4 Aug 2025 19:00:48 +0530 Subject: [PATCH 7/8] Made preproc file name and function name as mandatory args and updated documentation accordingly.
Signed-off-by: meetkuma --- QEfficient/finetune/dataset/custom_dataset.py | 36 +++++++++++-------- QEfficient/finetune/dataset/dataset_config.py | 3 +- docs/source/finetune.md | 4 +-- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index 6a8af0a23..ef76e83ed 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -30,17 +30,18 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non if not hasattr(dataset_config, "preproc_file"): logger.raise_error("Can not find preproc_file key in dataset_config file.", RuntimeError) - if ":" in dataset_config.preproc_file: - module_path, func_name = dataset_config.preproc_file.split(":") - else: - module_path, func_name = dataset_config.preproc_file, "preproc_fn" - logger.log_rank_zero( - f"Using '{func_name}' function from " - f"{dataset_config.preproc_file} as preprocessing function in " - "dataset preprocessing.", - logging.WARNING, + if ":" not in dataset_config.preproc_file: + logger.raise_error( + "The 'preproc_file' key in dataset_config file should follow the format: python_file_path:function_name", + RuntimeError, ) + module_path, func_name = dataset_config.preproc_file.split(":") + logger.log_rank_zero( + f"Using '{func_name}' function from {module_path} as preprocessing function in dataset preprocessing.", + logging.DEBUG, + ) + if not module_path.endswith(".py"): logger.raise_error(f"Custom dataset preprocessing file {module_path} is not a .py file.", ValueError) @@ -69,14 +70,21 @@ def get_data_collator(dataset_processer, dataset_config): ) return None - if ":" in dataset_config.collate_file: - module_path, func_name = dataset_config.collate_file.split(":") - else: - module_path, func_name = dataset_config.collate_file, "data_collator_fn" + if ":" not in dataset_config.collate_file: logger.log_rank_zero( - f"Using 
'{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing.", + "Can not find function name in 'collate_file' key in dataset_config " + "file. Using the default data collator function instead. If this is " + "not intended then change the format of the 'collate_file' key in " + "dataset_config file to follow the format: python_file_path:function_name", logging.WARNING, ) + return None + else: + module_path, func_name = dataset_config.collate_file.split(":") + logger.log_rank_zero( + f"Using '{func_name}' function from {module_path} as collate_fn in dataset preprocessing.", + logging.DEBUG, + ) if not module_path.endswith(".py"): logger.raise_error(f"Custom dataset collate file {module_path} is not a .py file.", ValueError) diff --git a/QEfficient/finetune/dataset/dataset_config.py b/QEfficient/finetune/dataset/dataset_config.py index 2e477be77..b766e923c 100644 --- a/QEfficient/finetune/dataset/dataset_config.py +++ b/QEfficient/finetune/dataset/dataset_config.py @@ -5,7 +5,6 @@ # # ----------------------------------------------------------------------------- -from functools import partial from QEfficient.finetune.dataset.alpaca_dataset import ( InstructionDataset as get_alpaca_dataset, @@ -23,7 +22,7 @@ ) DATASET_PREPROC = { - "alpaca_dataset": partial(get_alpaca_dataset), + "alpaca_dataset": get_alpaca_dataset, "grammar_dataset": get_grammar_dataset, "gsm8k_dataset": get_gsm8k_dataset, "custom_dataset": get_custom_dataset, diff --git a/docs/source/finetune.md b/docs/source/finetune.md index f8519d8c3..2f05282a8 100644 --- a/docs/source/finetune.md +++ b/docs/source/finetune.md @@ -98,8 +98,8 @@ tensorboard --logdir runs/ --bind_all ``` 2. **Specifying the Preprocessing Function** - - In `data_config.json`, include a `"preproc_file"` key to define the path to your preprocessing Python file and the function within it. - - Use the format `"filename.py:function_name"`. 
In the absence of `function_name`, the `preproc_fn` function will be used as default preprocessing function from the `filename.py`. + - In `data_config.json`, include a mandatory `"preproc_file"` key to define the path to your preprocessing Python file and the function within it. + - Use the format `"filename.py:function_name"`. Both the filename and the function name are required. _Example:_ ```json "preproc_file": "sample_dataset_preproc.py:preprocessing_fn" From bfad6f0671e072ab836bcaf62a0a637315293e21 Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Tue, 5 Aug 2025 10:18:14 +0530 Subject: [PATCH 8/8] Minor correction to the documentation. Signed-off-by: meetkuma --- docs/source/finetune.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/source/finetune.md b/docs/source/finetune.md index 2f05282a8..311605709 100644 --- a/docs/source/finetune.md +++ b/docs/source/finetune.md @@ -104,7 +104,7 @@ tensorboard --logdir runs/ --bind_all ```json "preproc_file": "sample_dataset_preproc.py:preprocessing_fn" ``` - - The preprocessing function must follow the structure below. The signature of the function should not be altered. The sample illustrates `apply_prompt_template` and `tokenize` as sub-functions, but we can define our own sub-functions as needed. For reference, check the example files in the [./QEfficient/finetune/dataset/](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset) directory. + - The preprocessing function must follow the structure below. The function parameters and the return type of the function should not be altered. The sample illustrates `apply_prompt_template` and `tokenize` as sub-functions, but we can define our own sub-functions as needed. For reference, check the example files in the [./QEfficient/finetune/dataset/](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset) directory.
```python def preprocessing_fn(dataset_config, tokenizer, split, context_length=None): # Load the dataset or read from the disk @@ -139,12 +139,10 @@ tensorboard --logdir runs/ --bind_all - In the sample preprocessing function above, the `split` variable takes its value from `data_config.json`. For the training dataset, the value will be taken from the `"train_split"` key, and for the evaluation/test dataset, it will be taken from the `"test_split"` key. - Additional arguments needed for the preprocessing function can be passed in `data_config.json` and will be available via the `dataset_config` variable within the function. For instance, in the sample config above, `"test_split_ratio"` and `"disc_style"` keys can be used in the preprocessing function to define the test split ratio and style of the dataset. These values are accessed through the `dataset_config` variable. Check out the sample preprocessing file at [./QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py). - - 3. **Custom Collate Function for Batching** - When using a batch size greater than 1, we may need to override the default collate (batching different samples together in a batch) behavior by including a `"collate_file"` key in `data_config.json`. - Use the same `"file.py:function"` format. If omitted, the default Hugging Face `DataCollatorForSeq2Seq` is typically used, which pads sequences to the longest length in the batch. - - A custom collate function must have the following signature. The signature of the function should not be altered: + - A custom collate function must follow the structure below. The function parameters and the return type of the function should not be altered: ```python def get_data_collator(tokenizer): # Define and return a custom collate_fn here