From a8e6a4a72fc9985a80047f548294df2e3693b2a3 Mon Sep 17 00:00:00 2001
From: Meet Patel <meetkuma@qti.qualcomm.com>
Date: Fri, 27 Jun 2025 21:17:54 +0530
Subject: [PATCH 1/8] Updated handling of custom dataset in FT. Updated
 finetune.md readme accordingly.

Signed-off-by: meetkuma <meetkuma@qti.qualcomm.com>
---
 QEfficient/cloud/finetune.py                  | 15 +--
 QEfficient/finetune/configs/dataset_config.py |  2 -
 QEfficient/finetune/dataset/custom_dataset.py | 45 ++++++---
 QEfficient/finetune/utils/config_utils.py     | 20 +++-
 QEfficient/finetune/utils/helper.py           | 11 +++
 QEfficient/finetune/utils/parser.py           | 14 +++
 docs/source/finetune.md                       | 97 ++++++++++++-------
 7 files changed, 144 insertions(+), 60 deletions(-)

diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py
index 8b59aa6a9..ac69809a5 100644
--- a/QEfficient/cloud/finetune.py
+++ b/QEfficient/cloud/finetune.py
@@ -21,6 +21,7 @@
 from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 
 from QEfficient.finetune.configs.training import TrainConfig
+from QEfficient.finetune.utils.helper import parse_unk_args
 from QEfficient.finetune.utils.config_utils import (
     generate_dataset_config,
     generate_peft_config,
@@ -133,7 +134,7 @@ def load_model_and_tokenizer(
         model = AutoModelForSequenceClassification.from_pretrained(
             pretrained_model_path,
             num_labels=dataset_config.num_labels,
-            attn_implementation="sdpa",
+            attn_implementation="eager",
             torch_dtype=torch.float16,
         )
 
@@ -151,7 +152,7 @@ def load_model_and_tokenizer(
         model = AutoModelForCausalLM.from_pretrained(
             pretrained_model_path,
             use_cache=False,
-            attn_implementation="sdpa",
+            attn_implementation="eager",
             torch_dtype=torch.float16,
             device_map=device_map,
         )
@@ -288,11 +289,10 @@ def main(**kwargs) -> None:
                 --model_name "meta-llama/Llama-3.2-1B" \\
                 --lr 5e-4
     """
-    # TODO:Remove TrainConfig() and update_config() as all params are passed in kwargs by parser
     train_config = TrainConfig()
     update_config(train_config, **kwargs)
-    dataset_config = generate_dataset_config(train_config.dataset)
-    update_config(dataset_config, **kwargs)
+    dataset_config_file = kwargs.pop("dataset_config", None)
+    dataset_config = generate_dataset_config(train_config.dataset, dataset_config_file)
 
     logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level)
 
@@ -341,6 +341,7 @@ def main(**kwargs) -> None:
 
 if __name__ == "__main__":
     parser = get_finetune_parser()
-    args = parser.parse_args()
+    args, unk_args = parser.parse_known_args()
+    unk_args_dict = parse_unk_args(unk_args)
     args_dict = vars(args)
-    main(**args_dict)
+    main(**args_dict, **unk_args_dict)
diff --git a/QEfficient/finetune/configs/dataset_config.py b/QEfficient/finetune/configs/dataset_config.py
index 1f4fe094b..a895e727c 100644
--- a/QEfficient/finetune/configs/dataset_config.py
+++ b/QEfficient/finetune/configs/dataset_config.py
@@ -41,7 +41,5 @@ class imdb_dataset:
 @dataclass
 class custom_dataset:
     dataset: str = "custom_dataset"
-    file: str = "dataset/custom_dataset.py"
     train_split: str = "train"
     test_split: str = "validation"
-    data_path: str = ""
diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py
index 4a1f500e3..f924a367f 100644
--- a/QEfficient/finetune/dataset/custom_dataset.py
+++ b/QEfficient/finetune/dataset/custom_dataset.py
@@ -26,18 +26,26 @@ def load_module_from_py_file(py_file: str) -> object:
 
 
 def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
+    if not hasattr(dataset_config, "preproc_file"):
+        raise RuntimeError("Can not find preproc_file key in dataset_config file.")
+
+    if ":" in dataset_config.preproc_file:
+        module_path, func_name = dataset_config.preproc_file.split(":")
     else:
-        module_path, func_name = dataset_config.file, "get_custom_dataset"
+        module_path, func_name = dataset_config.preproc_file, "get_custom_dataset"
+        print(
+            f"Using '{func_name}' function from "
+            f"{dataset_config.preproc_file} as preprocessing function in "
+            "dataset preprocessing."
+        )
 
     if not module_path.endswith(".py"):
-        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
+        logger.raise_error(f"Custom dataset preprocessing file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
         logger.raise_error(
-            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+            f"Custom dataset file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
         )
 
     module = load_module_from_py_file(module_path.as_posix())
@@ -45,24 +53,34 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non
         return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
     except AttributeError:
         logger.raise_error(
-            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).",
+            f"For custom dataset preprocessing, the method ({func_name}) is not "
+            f"present in the file ({module_path.as_posix()}).",
             AttributeError,
         )
 
 
 def get_data_collator(dataset_processer, dataset_config):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
+    if not hasattr(dataset_config, "collate_file"):
+        print(
+            f"Can not find collate_file key in dataset_config file. Using the default data collator function instead."
+        )
+        return None
+
+    if ":" in dataset_config.collate_file:
+        module_path, func_name = dataset_config.collate_file.split(":")
     else:
-        module_path, func_name = dataset_config.file, "get_data_collator"
+        module_path, func_name = dataset_config.collate_file, "get_data_collator"
+        print(
+            f"Using '{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing."
+        )
 
     if not module_path.endswith(".py"):
-        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
+        logger.raise_error(f"Custom dataset collate file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
         logger.raise_error(
-            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+            f"Custom dataset collate file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
         )
 
     module = load_module_from_py_file(module_path.as_posix())
@@ -70,7 +88,8 @@ def get_data_collator(dataset_processer, dataset_config):
         return getattr(module, func_name)(dataset_processer)
     except AttributeError:
         logger.log_rank_zero(
-            f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()})."
+            f"Can not find the function {func_name} in file "
+            f"({module_path.as_posix()}). Using the default data collator "
+            "function instead."
         )
-        logger.log_rank_zero("Using the default data_collator instead.")
         return None
diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py
index 64f17fecb..158c9d43b 100644
--- a/QEfficient/finetune/utils/config_utils.py
+++ b/QEfficient/finetune/utils/config_utils.py
@@ -9,12 +9,13 @@
 import json
 import os
 from dataclasses import asdict
-from typing import Any, Dict
+from typing import Any, Dict, Optional
+from collections import namedtuple
 
 import yaml
 from peft import LoraConfig as PeftLoraConfig
 
-import QEfficient.finetune.configs.dataset_config as datasets
+import QEfficient.finetune.configs.dataset_config as qeff_datasets
 from QEfficient.finetune.configs.peft_config import LoraConfig
 from QEfficient.finetune.configs.training import TrainConfig
 from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC
@@ -86,11 +87,14 @@ def generate_peft_config(train_config: TrainConfig, **kwargs) -> Any:
     return peft_config
 
 
-def generate_dataset_config(dataset_name: str) -> Any:
+def generate_dataset_config(dataset_name: str, custom_dataset_config: Optional[str] = None) -> Any:
     """Generate a dataset configuration based on the specified dataset.
 
     Args:
         dataset_name (str): Name of the dataset to be used for finetuning.
+        custom_dataset_config (str): Dataset config json file for custom dataset.
+            This file contains dataset specific arguments to be used in dataset
+            preprocessing step.
 
     Returns:
         Any: A dataset configuration object.
@@ -101,7 +105,15 @@ def generate_dataset_config(dataset_name: str) -> Any:
     supported_datasets = DATASET_PREPROC.keys()
     assert dataset_name in supported_datasets, f"Given dataset '{dataset_name}' is not supported."
     # FIXME (Meet): Replace below logic by creating using auto registry of datasets.
-    dataset_config = {k: v for k, v in inspect.getmembers(datasets)}[dataset_name]()
+    dataset_config = {k: v for k, v in inspect.getmembers(qeff_datasets)}[dataset_name]()
+    if dataset_name == "custom_dataset":
+        custom_dataset_dict = asdict(dataset_config)
+        custom_dataset_dict_override = load_config_file(custom_dataset_config)
+        # Override existing and add new params to dataset_config.
+        custom_dataset_dict.update(custom_dataset_dict_override)
+
+        custom_dataset_class = namedtuple("custom_dataset", custom_dataset_dict.keys())
+        dataset_config = custom_dataset_class(**custom_dataset_dict)
     return dataset_config
 
 
diff --git a/QEfficient/finetune/utils/helper.py b/QEfficient/finetune/utils/helper.py
index fd584d8c0..f92f022ee 100644
--- a/QEfficient/finetune/utils/helper.py
+++ b/QEfficient/finetune/utils/helper.py
@@ -227,3 +227,14 @@ def save_to_json(
     }
     with open(output_filename, "w") as f:
         json.dump(metrics_data, f)
+TASK_TYPE = ["generation", "seq_classification"]
+PEFT_METHOD = ["lora"]
+DEVICE = ["qaic", "cpu", "cuda"]
+BATCHING_STRATEGY = ["padding", "packing"]
+
+
+def parse_unk_args(unk_args_str):
+    if len(unk_args_str) % 2 != 0:
+        raise RuntimeError("Unknown arguments must be in pairs")
+    unk_args_dict = {unk_args_str[i].replace("--", ""): unk_args_str[i + 1] for i in range(0, len(unk_args_str), 2)}
+    return unk_args_dict
diff --git a/QEfficient/finetune/utils/parser.py b/QEfficient/finetune/utils/parser.py
index ad53ae35d..ec8fb70d3 100644
--- a/QEfficient/finetune/utils/parser.py
+++ b/QEfficient/finetune/utils/parser.py
@@ -43,6 +43,20 @@ def get_finetune_parser():
         default=None,
         help="Name of the tokenizer,if not passed as an argument, it uses the value of model_name",
     )
+    parser.add_argument(
+        "--peft_config_file",
+        "--peft-config-file",
+        type=str,
+        default=None,
+        help="Path of PEFT config json file to override the PEFT config params such as lora_r, lora_alpha etc.",
+    )
+    parser.add_argument(
+        "--custom_dataset_config",
+        "--custom-dataset-config",
+        type=str,
+        default=None,
+        help="Path of custom dataset config json file to override the custom dataset params such as test_split_ratio, test_split etc.",
+    )
     parser.add_argument(
         "--run_validation",
         "--run-validation",
diff --git a/docs/source/finetune.md b/docs/source/finetune.md
index be8dfde00..8bb4d58fa 100644
--- a/docs/source/finetune.md
+++ b/docs/source/finetune.md
@@ -75,38 +75,67 @@ tensorboard --logdir runs/<file> --bind_all
     1) Gradient accumulation: By default, gradient accumulation happens for 4 steps. To update this value, command line argument gradient_accumulation_steps has to be passed. (Example: '--gradient_accumulation_steps 8')
     2) Gradient Checkpointing: By default, gradient checkpointing is disabled. To enable it, command line argument gradient_accumulation_steps has to be passed.
 
-## Fine-Tuning on custom dataset
 
-To run fine tuning for any user specific dataset, prepare the dataset using the following steps:
-
-1. Create a directory named 'dataset' inside efficient-transformers.
-2. Inside this directory, create a file named 'custom_dataset.py'.
-3. Inside the newly created efficient-transformers/dataset/custom_dataset.py, define a function named 'get_custom_dataset'. 
-4. get_custom_dataset() should have following 4 parameters:  dataset_config, tokenizer, split, context_length.  
-5. Inside get_custom_dataset(), user needs to apply prompt and tokenize the dataset accordingly. Please refer the below template on how to define get_custom_dataset().
-6. For examples, please refer python files present in [dataset](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset).
-7. In [dataset_config.py](https://github.com/quic/efficient-transformers/blob/main/QEfficient/finetune/configs/dataset_config.py), for custom_dataset class, pass the appropriate value for train_split and test_split. As an alternative, these values can be passed as command line arguments as well with the finetune command. For example "--train_split train".
-8. While running fine tuning, pass argument "-–dataset custom_dataset" to finetune on custom dataset.   
-
-Template for get_custom_dataset() to be defined inside efficient-transformers/dataset/custom_dataset.py is as follows:
-
-```python
-def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
-
-    # load dataset
-    # based on split, retrieve only the specific portion of the dataset (train or eval) either here or at the last
-    
-    def apply_prompt_template():
-        # transform the passed datapoint by applying the prompt on it 
-    
-    def tokenize():
-        # tokenize the passed datapoint
-    
-    # define the prompt
-    # call apply_prompt_template() for each data point:
-    # dataset = dataset.map(apply_prompt_template ,<other args>)
-    # call tokenize() for each data point:
-    # dataset = dataset.map(tokenize, <other args>)
-    
-    return dataset
-```
+### 🔧 Steps to Fine-Tune with a Custom Dataset
+
+1. **Launching Fine-Tuning with a Custom Dataset**  
+   Use the following command-line arguments to begin fine-tuning:
+   ```
+   --dataset custom_dataset --custom_dataset_config data_config.json
+   ```
+   The `data_config.json` file contains essential parameters used during dataset preprocessing.
+
+2. **Specifying the Preprocessing Function**  
+   - In `data_config.json`, include a `"preproc_file"` key to define the path to your preprocessing Python file.
+   - To specify a custom function within that file, use the format `"filename.py:function_name"`.  
+     _Example:_  
+     ```json
+     "preproc_file": "disc_preproc.py:get_preprocessed_disc"
+     ```
+   - Your preprocessing function must follow this structure:
+     ```python
+     def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
+         def apply_prompt_template():
+             # Apply prompt formatting to each datapoint
+
+         def tokenize():
+             # Tokenize the formatted datapoint
+
+         # Apply functions to dataset using map
+         dataset = dataset.map(apply_prompt_template, ...)
+         dataset = dataset.map(tokenize, ...)
+         
+         return dataset
+     ```
+
+3. **Custom Collate Function for Batching**  
+   - When using a batch size greater than 1, you may override the default collate behavior by including a `"collate_file"` key in `data_config.json`.
+   - Use the same `"file.py:function"` format. If omitted, the default Hugging Face `DataCollatorForSeq2Seq` is used, which pads sequences to the longest length in the batch.
+   - A custom collate function must have the following signature:
+     ```python
+     def get_data_collator(tokenizer):
+         # Define and return a custom collate_fn here
+     ```
+
+4. **Passing Additional Configuration Parameters**  
+   You can add custom arguments in `data_config.json`, which will be accessible via the `dataset_config` argument inside your `get_custom_dataset()` function.
+
+5. **Example `data_config.json` File**
+   ```json
+   {
+     "train_split": "train",
+     "test_split": "test",
+     "test_split_ratio": 0.15,
+     "preproc_file": "disc_preproc.py:get_preprocessed_disc",
+     "collate_file": "disc_preproc.py:get_collate_fn_disc",
+     "disc_style": "sarcasm_more"
+   }
+   ```
+
+6. **Implementing Custom Preprocessing Logic**  
+   Within your dataset loader function, define `apply_prompt_template()` to transform raw data into the desired prompt format, and `tokenize()` to convert it into token IDs using the tokenizer.
+
+7. **Reference for Dataset Utilities**  
+   You can refer to existing implementations in the [dataset directory of this repository](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset).
+
+---

From 8d71ae221661314fbc7ccebc4a18bb8f2b85e2e7 Mon Sep 17 00:00:00 2001
From: Meet Patel <meetkuma@qti.qualcomm.com>
Date: Mon, 21 Jul 2025 16:30:20 +0530
Subject: [PATCH 2/8] Minor changes to data collator call to explicitly pass
 -100 for pad token.

Signed-off-by: meetkuma <meetkuma@qti.qualcomm.com>
---
 QEfficient/finetune/utils/dataset_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/QEfficient/finetune/utils/dataset_utils.py b/QEfficient/finetune/utils/dataset_utils.py
index 0fb325c8a..d3a7e7185 100644
--- a/QEfficient/finetune/utils/dataset_utils.py
+++ b/QEfficient/finetune/utils/dataset_utils.py
@@ -64,8 +64,9 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, split):
             kwargs["drop_last"] = False
     else:
         kwargs["batch_size"] = batch_size
-        kwargs["drop_last"] = False
-    kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer)
+        kwargs["drop_last"] = True
+    # todo: -100 should be changed to a variable. or tokenizer.pad_token_id
+    kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer, label_pad_token_id=-100)
     return kwargs
 
 

From 2d7a88c4ca1c99bed3ed1de2f768041e16afa98a Mon Sep 17 00:00:00 2001
From: Meet Patel <meetkuma@qti.qualcomm.com>
Date: Mon, 21 Jul 2025 16:59:08 +0530
Subject: [PATCH 3/8] Removed redundant code changes based on recent merged
 PRs.

Signed-off-by: meetkuma <meetkuma@qti.qualcomm.com>
---
 QEfficient/cloud/finetune.py                  |  6 ++----
 QEfficient/finetune/dataset/custom_dataset.py | 18 +++++++++++-------
 QEfficient/finetune/utils/config_utils.py     |  2 +-
 QEfficient/finetune/utils/helper.py           | 11 -----------
 QEfficient/finetune/utils/parser.py           |  7 -------
 5 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py
index ac69809a5..6a0d81d44 100644
--- a/QEfficient/cloud/finetune.py
+++ b/QEfficient/cloud/finetune.py
@@ -21,7 +21,6 @@
 from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 
 from QEfficient.finetune.configs.training import TrainConfig
-from QEfficient.finetune.utils.helper import parse_unk_args
 from QEfficient.finetune.utils.config_utils import (
     generate_dataset_config,
     generate_peft_config,
@@ -341,7 +340,6 @@ def main(**kwargs) -> None:
 
 if __name__ == "__main__":
     parser = get_finetune_parser()
-    args, unk_args = parser.parse_known_args()
-    unk_args_dict = parse_unk_args(unk_args)
+    args = parser.parse_args()
     args_dict = vars(args)
-    main(**args_dict, **unk_args_dict)
+    main(**args_dict)
diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py
index f924a367f..a18447f29 100644
--- a/QEfficient/finetune/dataset/custom_dataset.py
+++ b/QEfficient/finetune/dataset/custom_dataset.py
@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import importlib
+import logging
 from pathlib import Path
 
 from QEfficient.finetune.utils.logging_utils import logger
@@ -27,16 +28,17 @@ def load_module_from_py_file(py_file: str) -> object:
 
 def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
     if not hasattr(dataset_config, "preproc_file"):
-        raise RuntimeError("Can not find preproc_file key in dataset_config file.")
+        logger.raise_error("Can not find preproc_file key in dataset_config file.", RuntimeError)
 
     if ":" in dataset_config.preproc_file:
         module_path, func_name = dataset_config.preproc_file.split(":")
     else:
         module_path, func_name = dataset_config.preproc_file, "get_custom_dataset"
-        print(
+        logger.log_rank_zero(
             f"Using '{func_name}' function from "
             f"{dataset_config.preproc_file} as preprocessing function in "
-            "dataset preprocessing."
+            "dataset preprocessing.",
+            logging.WARNING,
         )
 
     if not module_path.endswith(".py"):
@@ -61,8 +63,9 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non
 
 def get_data_collator(dataset_processer, dataset_config):
     if not hasattr(dataset_config, "collate_file"):
-        print(
-            f"Can not find collate_file key in dataset_config file. Using the default data collator function instead."
+        logger.log_rank_zero(
+            "Can not find collate_file key in dataset_config file. Using the default data collator function instead.",
+            logging.WARNING,
         )
         return None
 
@@ -70,8 +73,9 @@ def get_data_collator(dataset_processer, dataset_config):
         module_path, func_name = dataset_config.collate_file.split(":")
     else:
         module_path, func_name = dataset_config.collate_file, "get_data_collator"
-        print(
-            f"Using '{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing."
+        logger.log_rank_zero(
+            f"Using '{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing.",
+            logging.WARNING,
         )
 
     if not module_path.endswith(".py"):
diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py
index 158c9d43b..c0c4c4386 100644
--- a/QEfficient/finetune/utils/config_utils.py
+++ b/QEfficient/finetune/utils/config_utils.py
@@ -8,9 +8,9 @@
 import inspect
 import json
 import os
+from collections import namedtuple
 from dataclasses import asdict
 from typing import Any, Dict, Optional
-from collections import namedtuple
 
 import yaml
 from peft import LoraConfig as PeftLoraConfig
diff --git a/QEfficient/finetune/utils/helper.py b/QEfficient/finetune/utils/helper.py
index f92f022ee..fd584d8c0 100644
--- a/QEfficient/finetune/utils/helper.py
+++ b/QEfficient/finetune/utils/helper.py
@@ -227,14 +227,3 @@ def save_to_json(
     }
     with open(output_filename, "w") as f:
         json.dump(metrics_data, f)
-TASK_TYPE = ["generation", "seq_classification"]
-PEFT_METHOD = ["lora"]
-DEVICE = ["qaic", "cpu", "cuda"]
-BATCHING_STRATEGY = ["padding", "packing"]
-
-
-def parse_unk_args(unk_args_str):
-    if len(unk_args_str) % 2 != 0:
-        raise RuntimeError("Unknown arguments must be in pairs")
-    unk_args_dict = {unk_args_str[i].replace("--", ""): unk_args_str[i + 1] for i in range(0, len(unk_args_str), 2)}
-    return unk_args_dict
diff --git a/QEfficient/finetune/utils/parser.py b/QEfficient/finetune/utils/parser.py
index ec8fb70d3..18dae8ec2 100644
--- a/QEfficient/finetune/utils/parser.py
+++ b/QEfficient/finetune/utils/parser.py
@@ -43,13 +43,6 @@ def get_finetune_parser():
         default=None,
         help="Name of the tokenizer,if not passed as an argument, it uses the value of model_name",
     )
-    parser.add_argument(
-        "--peft_config_file",
-        "--peft-config-file",
-        type=str,
-        default=None,
-        help="Path of PEFT config json file to override the PEFT config params such as lora_r, lora_alpha etc.",
-    )
     parser.add_argument(
         "--custom_dataset_config",
         "--custom-dataset-config",

From c1f6fd0b7affd07e4ae9b014339762fbcc5e6eea Mon Sep 17 00:00:00 2001
From: Meet Patel <meetkuma@qti.qualcomm.com>
Date: Mon, 21 Jul 2025 20:35:06 +0530
Subject: [PATCH 4/8] Added a sample custom dataset config, custom dataset
 preprocessing python code and peft config file.

Signed-off-by: meetkuma <meetkuma@qti.qualcomm.com>
---
 QEfficient/cloud/finetune.py                  |  4 +-
 .../finetune/configs/sample_peft_config.json  | 17 ++++
 .../dataset/custom_dataset/disc_preproc.py    | 87 +++++++++++++++++++
 .../custom_dataset/sample_dataset_config.json |  7 ++
 QEfficient/finetune/utils/config_utils.py     |  5 ++
 5 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 QEfficient/finetune/configs/sample_peft_config.json
 create mode 100644 QEfficient/finetune/dataset/custom_dataset/disc_preproc.py
 create mode 100644 QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json

diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py
index 6a0d81d44..941024cb5 100644
--- a/QEfficient/cloud/finetune.py
+++ b/QEfficient/cloud/finetune.py
@@ -290,8 +290,8 @@ def main(**kwargs) -> None:
     """
     train_config = TrainConfig()
     update_config(train_config, **kwargs)
-    dataset_config_file = kwargs.pop("dataset_config", None)
-    dataset_config = generate_dataset_config(train_config.dataset, dataset_config_file)
+    custom_dataset_config_file = kwargs.pop("custom_dataset_config", None)
+    dataset_config = generate_dataset_config(train_config.dataset, custom_dataset_config_file)
 
     logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level)
 
diff --git a/QEfficient/finetune/configs/sample_peft_config.json b/QEfficient/finetune/configs/sample_peft_config.json
new file mode 100644
index 000000000..c53c9c9dd
--- /dev/null
+++ b/QEfficient/finetune/configs/sample_peft_config.json
@@ -0,0 +1,17 @@
+{
+    "r": 32,
+    "lora_alpha": 64,
+    "target_modules": [
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "up_proj",
+        "down_proj",
+        "gate_proj"
+    ],
+    "bias": "none",
+    "task_type": "CAUSAL_LM",
+    "lora_dropout": 0.05,
+    "inference_mode": false
+}
\ No newline at end of file
diff --git a/QEfficient/finetune/dataset/custom_dataset/disc_preproc.py b/QEfficient/finetune/dataset/custom_dataset/disc_preproc.py
new file mode 100644
index 000000000..d06a40b94
--- /dev/null
+++ b/QEfficient/finetune/dataset/custom_dataset/disc_preproc.py
@@ -0,0 +1,87 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+
+import datasets
+from transformers.data import DataCollatorForSeq2Seq
+
+
+def get_data_collator(tokenizer):
+    return DataCollatorForSeq2Seq(tokenizer)
+
+
+def get_preprocessed_disc(dataset_config, tokenizer, split, context_length=None):
+    dataset = datasets.load_dataset("hallisky/DiSC")
+
+    # Considering 'train' split as this dataset has only one split.
+    dataset = dataset["train"]
+
+    test_split_ratio = dataset_config.test_split_ratio
+    disc_style = dataset_config.disc_style
+
+    # Only collect the samples for a given style.
+    available_styles = set(dataset["category"])
+    if disc_style not in available_styles:
+        raise RuntimeError(f"For DiSC dataset the provided disc_stype '{disc_style}' is not supported.")
+
+    dataset = dataset.filter(lambda example: example["category"] == disc_style)
+
+    # Shuffle the dataset before splitting
+    dataset = dataset.shuffle(seed=42)
+
+    # Split the data in train and test split.
+    total_samples = len(dataset)
+    test_size = int(total_samples * test_split_ratio)
+    train_size = total_samples - test_size
+
+    if split == "test":
+        indices = range(train_size, total_samples)
+    else:
+        indices = range(0, train_size)
+
+    dataset = dataset.select(indices)
+
+    if tokenizer.pad_token is None:
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+
+    # Below is the template of the DiSC dataset.
+    # <bos>### Original:{original} \n ### Rewrite: {rewrite} <eos>
+    template = "### Original:{original} \n ### Rewrite: "
+
+    def apply_prompt_template(sample):
+        return {
+            "input": template.format(original=sample["original"]),
+            "label": sample["generation"],
+        }
+
+    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
+
+    def tokenize_add_label(sample):
+        input = tokenizer.encode(
+            tokenizer.bos_token + sample["input"],
+            add_special_tokens=False,
+            max_length=context_length,
+            pad_to_max_length=True,
+        )
+        label = tokenizer.encode(
+            sample["label"] + tokenizer.pad_token + tokenizer.eos_token,
+            add_special_tokens=False,
+            max_length=context_length,
+            pad_to_max_length=True,
+        )
+
+        sample = {
+            "input_ids": (input + label),
+            "attention_mask": [1] * (len(input) + len(label)),
+            "labels": [-100] * len(input) + label,
+        }
+
+        return sample
+
+    dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features))
+
+    return dataset
diff --git a/QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json
new file mode 100644
index 000000000..2f6be9bc7
--- /dev/null
+++ b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json
@@ -0,0 +1,7 @@
+{
+    "train_split": "train",
+    "test_split": "test",
+    "test_split_ratio": 0.15,
+    "preproc_file": "./QEfficient/finetune/dataset/custom_dataset/disc_preproc.py:get_preprocessed_disc",
+    "disc_style": "sarcasm_more"
+}
\ No newline at end of file
diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py
index c0c4c4386..0c8b3d827 100644
--- a/QEfficient/finetune/utils/config_utils.py
+++ b/QEfficient/finetune/utils/config_utils.py
@@ -107,6 +107,11 @@ def generate_dataset_config(dataset_name: str, custom_dataset_config: Optional[s
     # FIXME (Meet): Replace below logic by creating using auto registry of datasets.
     dataset_config = {k: v for k, v in inspect.getmembers(qeff_datasets)}[dataset_name]()
     if dataset_name == "custom_dataset":
+        if custom_dataset_config is None:
+            logger.raise_error(
+                "For 'custom_dataset', please provide dataset config file via 'custom_dataset_config' flag.",
+                RuntimeError,
+            )
         custom_dataset_dict = asdict(dataset_config)
         custom_dataset_dict_override = load_config_file(custom_dataset_config)
         # Override existing and add new params to dataset_config.

From ef4740e92218b5ba2bf3384e30f5b8df36361e6c Mon Sep 17 00:00:00 2001
From: Meet Patel <meetkuma@qti.qualcomm.com>
Date: Wed, 23 Jul 2025 16:06:26 +0530
Subject: [PATCH 5/8] Changed attention back to sdpa from eager.

Signed-off-by: meetkuma <meetkuma@qti.qualcomm.com>
---
 QEfficient/cloud/finetune.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py
index 941024cb5..d8de58951 100644
--- a/QEfficient/cloud/finetune.py
+++ b/QEfficient/cloud/finetune.py
@@ -133,7 +133,7 @@ def load_model_and_tokenizer(
         model = AutoModelForSequenceClassification.from_pretrained(
             pretrained_model_path,
             num_labels=dataset_config.num_labels,
-            attn_implementation="eager",
+            attn_implementation="sdpa",
             torch_dtype=torch.float16,
         )
 
@@ -151,7 +151,7 @@ def load_model_and_tokenizer(
         model = AutoModelForCausalLM.from_pretrained(
             pretrained_model_path,
             use_cache=False,
-            attn_implementation="eager",
+            attn_implementation="sdpa",
             torch_dtype=torch.float16,
             device_map=device_map,
         )

From 172625759f4c70cb35964c2be6694e0fb11df838 Mon Sep 17 00:00:00 2001
From: Meet Patel <meetkuma@qti.qualcomm.com>
Date: Mon, 4 Aug 2025 14:48:10 +0530
Subject: [PATCH 6/8] Updated documentation based on review comments.

Signed-off-by: meetkuma <meetkuma@qti.qualcomm.com>
---
 QEfficient/finetune/dataset/custom_dataset.py |   4 +-
 ...c_preproc.py => sample_dataset_preproc.py} |   2 +-
 docs/source/finetune.md                       | 135 ++++++++++--------
 3 files changed, 79 insertions(+), 62 deletions(-)
 rename QEfficient/finetune/dataset/custom_dataset/{disc_preproc.py => sample_dataset_preproc.py} (98%)

diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py
index a18447f29..6a8af0a23 100644
--- a/QEfficient/finetune/dataset/custom_dataset.py
+++ b/QEfficient/finetune/dataset/custom_dataset.py
@@ -33,7 +33,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non
     if ":" in dataset_config.preproc_file:
         module_path, func_name = dataset_config.preproc_file.split(":")
     else:
-        module_path, func_name = dataset_config.preproc_file, "get_custom_dataset"
+        module_path, func_name = dataset_config.preproc_file, "preproc_fn"
         logger.log_rank_zero(
             f"Using '{func_name}' function from "
             f"{dataset_config.preproc_file} as preprocessing function in "
@@ -72,7 +72,7 @@ def get_data_collator(dataset_processer, dataset_config):
     if ":" in dataset_config.collate_file:
         module_path, func_name = dataset_config.collate_file.split(":")
     else:
-        module_path, func_name = dataset_config.collate_file, "get_data_collator"
+        module_path, func_name = dataset_config.collate_file, "data_collator_fn"
         logger.log_rank_zero(
             f"Using '{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing.",
             logging.WARNING,
diff --git a/QEfficient/finetune/dataset/custom_dataset/disc_preproc.py b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py
similarity index 98%
rename from QEfficient/finetune/dataset/custom_dataset/disc_preproc.py
rename to QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py
index d06a40b94..78db5674c 100644
--- a/QEfficient/finetune/dataset/custom_dataset/disc_preproc.py
+++ b/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py
@@ -26,7 +26,7 @@ def get_preprocessed_disc(dataset_config, tokenizer, split, context_length=None)
     # Only collect the samples for a given style.
     available_styles = set(dataset["category"])
     if disc_style not in available_styles:
-        raise RuntimeError(f"For DiSC dataset the provided disc_stype '{disc_style}' is not supported.")
+        raise RuntimeError(f"For DiSC dataset the provided disc_style '{disc_style}' is not supported.")
 
     dataset = dataset.filter(lambda example: example["category"] == disc_style)
 
diff --git a/docs/source/finetune.md b/docs/source/finetune.md
index 8bb4d58fa..f8519d8c3 100644
--- a/docs/source/finetune.md
+++ b/docs/source/finetune.md
@@ -78,64 +78,81 @@ tensorboard --logdir runs/<file> --bind_all
 
 ### 🔧 Steps to Fine-Tune with a Custom Dataset
 
-1. **Launching Fine-Tuning with a Custom Dataset**  
-   Use the following command-line arguments to begin fine-tuning:
-   ```
-   --dataset custom_dataset --dataset_config data_config.json
-   ```
-   The `data_config.json` file contains essential parameters used during dataset preprocessing.
-
-2. **Specifying the Preprocessing Function**  
-   - In `data_config.json`, include a `"preproc_file"` key to define the path to your preprocessing Python file.
-   - To specify a custom function within that file, use the format `"filename.py:function_name"`.  
-     _Example:_  
-     ```json
-     "preproc_file": "disc_preproc.py:get_preprocessed_disc"
-     ```
-   - Your preprocessing function must follow this structure:
-     ```python
-     def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
-         def apply_prompt_template():
-             # Apply prompt formatting to each datapoint
-
-         def tokenize():
-             # Tokenize the formatted datapoint
-
-         # Apply functions to dataset using map
-         dataset = dataset.map(apply_prompt_template, ...)
-         dataset = dataset.map(tokenize, ...)
+1.  **Launching Fine-Tuning with a Custom Dataset**
+    -   Use the following command-line arguments to begin fine-tuning using a custom dataset:
+        ```bash
+        --dataset custom_dataset --dataset_config data_config.json
+        ```
+    -   The `--dataset_config` argument is mandatory when `--dataset custom_dataset` is specified. The `data_config.json` file contains essential parameters used during dataset preprocessing.
+
+        __Example `data_config.json` File__
+        ```json
+        {
+        "train_split": "train",
+        "test_split": "test",
+        "test_split_ratio": 0.15,
+        "preproc_file": "sample_dataset_preproc.py:preprocessing_fn",
+        "collate_file": "sample_dataset_preproc.py:data_collate_fn",
+        "disc_style": "sarcasm_more"
+        }
+        ```
+
+2.  **Specifying the Preprocessing Function**
+    -   In `data_config.json`, include a `"preproc_file"` key to define the path to your preprocessing Python file and the function within it.
+    -   Use the format `"filename.py:function_name"`. In the absence of `function_name`, the `preproc_fn` function will be used as default preprocessing function from the `filename.py`.
+        _Example:_
+        ```json
+        "preproc_file": "sample_dataset_preproc.py:preprocessing_fn"
+        ```
+    -   The preprocessing function must follow the structure below. The signature of the function should not be altered. The sample illustrates `apply_prompt_template` and `tokenize` as sub-functions, but we can define our own sub-functions as needed. For reference, check the example files in the [./QEfficient/finetune/dataset/](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset) directory.
+        ```python
+        def preprocessing_fn(dataset_config, tokenizer, split, context_length=None):
+            # Load the dataset or read from the disk
+            # ...
+
+            # Split the dataset into train and test splits if needed,
+            # and use the appropriate split based on the 'split' argument.
+            # ...
+
+            def apply_prompt_template(example):
+                # Apply prompt formatting to each datapoint (e.g., example)
+                # ...
+                return example # Return the processed example
+
+            def tokenize(example):
+                # Tokenize the formatted datapoint (e.g., example)
+                # ...
+                return tokenizer(example["text"], truncation=True, max_length=context_length) # Example tokenization
+
+            # Apply the prompt template to format each sample according to the dataset and task.
+            dataset = dataset.map(apply_prompt_template, ...)
+
+            # Finally, tokenize the dataset
+            dataset = dataset.map(tokenize, batched=True, remove_columns=['text']) # Example batched tokenization
+            
+            # Each sample in the dataset should have keys acceptable by the HF
+            # model and the loss function.
+            # Typically, for CausalLM models used with 'generation' task_mode,
+            # the keys should be 'input_ids', 'attention_mask', and 'labels'.
+            return dataset
+        ```
+        -   In the sample preprocessing function above, the `split` variable takes its value from `data_config.json`. For the training dataset, the value will be taken from the `"train_split"` key, and for the evaluation/test dataset, it will be taken from the `"test_split"` key.
+        -   Additional arguments needed for the preprocessing function can be passed in `data_config.json` and will be available via the `dataset_config` variable within the function. For instance, in the sample config above, `"test_split_ratio"` and `"disc_style"` keys can be used in the preprocessing function to define the test split ratio and style of the dataset. These values are accessed through the `dataset_config` variable. Check out the sample preprocessing file at [./QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py).
+
+
+
+3.  **Custom Collate Function for Batching**
+    -   When using a batch size greater than 1, we may need to override the default collate (batching different samples together in a batch) behavior by including a `"collate_file"` key in `data_config.json`.
+    -   Use the same `"file.py:function"` format. If omitted, the default Hugging Face `DataCollatorForSeq2Seq` is typically used, which pads sequences to the longest length in the batch.
+    -   A custom collate function must have the following signature. The signature of the function should not be altered:
+        ```python
+        def get_data_collator(tokenizer):
+            # Define and return a custom collate_fn here
+            # ...
          
-         return dataset
-     ```
-
-3. **Custom Collate Function for Batching**  
-   - When using a batch size greater than 1, you may override the default collate behavior by including a `"collate_file"` key in `data_config.json`.
-   - Use the same `"file.py:function"` format. If omitted, the default Hugging Face `DataCollatorForSeq2Seq` is used, which pads sequences to the longest length in the batch.
-   - A custom collate function must have the following signature:
-     ```python
-     def get_data_collator(tokenizer):
-         # Define and return a custom collate_fn here
-     ```
-
-4. **Passing Additional Configuration Parameters**  
-   You can add custom arguments in `data_config.json`, which will be accessible via the `dataset_config` argument inside your `get_custom_dataset()` function.
-
-5. **Example `data_config.json` File**
-   ```json
-   {
-     "train_split": "train",
-     "test_split": "test",
-     "test_split_ratio": 0.15,
-     "preproc_file": "disc_preprocd.py:get_preprocessed_disc",
-     "collate_file": "disc_preprocd.py:get_collate_fn_disc",
-     "disc_style": "sarcasm_more"
-   }
-   ```
-
-6. **Implementing Custom Preprocessing Logic**  
-   Within your dataset loader function, define `apply_prompt_template()` to manipulate raw data into desired prompt format, and `tokenize()` to convert it into token IDs using the tokenizer.
-
-7. **Reference for Dataset Utilities**  
-   You can refer to existing implementations in the [dataset directory of this repository](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset).
-
+            # The returned collate_fn should take a list of samples and return a batch.
+            # Example:
+            # from transformers import DataCollatorForLanguageModeling
+            # return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+        ```
 ---

From f42e5b8120ff7250dd6d711dc78cd9f70bbdd9fa Mon Sep 17 00:00:00 2001
From: Meet Patel <meetkuma@qti.qualcomm.com>
Date: Mon, 4 Aug 2025 19:00:48 +0530
Subject: [PATCH 7/8] Made preproc file name and function name as mandatory
 args and updated documentation accordingly.

Signed-off-by: meetkuma <meetkuma@qti.qualcomm.com>
---
 QEfficient/finetune/dataset/custom_dataset.py | 36 +++++++++++--------
 QEfficient/finetune/dataset/dataset_config.py |  3 +-
 docs/source/finetune.md                       |  4 +--
 3 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py
index 6a8af0a23..ef76e83ed 100644
--- a/QEfficient/finetune/dataset/custom_dataset.py
+++ b/QEfficient/finetune/dataset/custom_dataset.py
@@ -30,17 +30,18 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non
     if not hasattr(dataset_config, "preproc_file"):
         logger.raise_error("Can not find preproc_file key in dataset_config file.", RuntimeError)
 
-    if ":" in dataset_config.preproc_file:
-        module_path, func_name = dataset_config.preproc_file.split(":")
-    else:
-        module_path, func_name = dataset_config.preproc_file, "preproc_fn"
-        logger.log_rank_zero(
-            f"Using '{func_name}' function from "
-            f"{dataset_config.preproc_file} as preprocessing function in "
-            "dataset preprocessing.",
-            logging.WARNING,
+    if ":" not in dataset_config.preproc_file:
+        logger.raise_error(
+            "The 'preproc_file' key in dataset_config file should follow the format: python_file_path:function_name",
+            RuntimeError,
         )
 
+    module_path, func_name = dataset_config.preproc_file.split(":")
+    logger.log_rank_zero(
+        f"Using '{func_name}' function from {module_path} as preprocessing function in dataset preprocessing.",
+        logging.DEBUG,
+    )
+
     if not module_path.endswith(".py"):
         logger.raise_error(f"Custom dataset preprocessing file {module_path} is not a .py file.", ValueError)
 
@@ -69,14 +70,21 @@ def get_data_collator(dataset_processer, dataset_config):
         )
         return None
 
-    if ":" in dataset_config.collate_file:
-        module_path, func_name = dataset_config.collate_file.split(":")
-    else:
-        module_path, func_name = dataset_config.collate_file, "data_collator_fn"
+    if ":" not in dataset_config.collate_file:
         logger.log_rank_zero(
-            f"Using '{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing.",
+            "Can not find function name in 'collate_file' key in dataset_config "
+            "file. Using the default data collator function instead. If this is "
+            "not intended then change the format of the 'collate_file' key in "
+            "dataset_config file to follow the format: python_file_path:function_name",
             logging.WARNING,
         )
+        return None
+    else:
+        module_path, func_name = dataset_config.collate_file.split(":")
+        logger.log_rank_zero(
+            f"Using '{func_name}' function from {module_path} as collate_fn in dataset preprocessing.",
+            logging.DEBUG,
+        )
 
     if not module_path.endswith(".py"):
         logger.raise_error(f"Custom dataset collate file {module_path} is not a .py file.", ValueError)
diff --git a/QEfficient/finetune/dataset/dataset_config.py b/QEfficient/finetune/dataset/dataset_config.py
index 2e477be77..b766e923c 100644
--- a/QEfficient/finetune/dataset/dataset_config.py
+++ b/QEfficient/finetune/dataset/dataset_config.py
@@ -5,7 +5,6 @@
 #
 # -----------------------------------------------------------------------------
 
-from functools import partial
 
 from QEfficient.finetune.dataset.alpaca_dataset import (
     InstructionDataset as get_alpaca_dataset,
@@ -23,7 +22,7 @@
 )
 
 DATASET_PREPROC = {
-    "alpaca_dataset": partial(get_alpaca_dataset),
+    "alpaca_dataset": get_alpaca_dataset,
     "grammar_dataset": get_grammar_dataset,
     "gsm8k_dataset": get_gsm8k_dataset,
     "custom_dataset": get_custom_dataset,
diff --git a/docs/source/finetune.md b/docs/source/finetune.md
index f8519d8c3..2f05282a8 100644
--- a/docs/source/finetune.md
+++ b/docs/source/finetune.md
@@ -98,8 +98,8 @@ tensorboard --logdir runs/<file> --bind_all
         ```
 
 2.  **Specifying the Preprocessing Function**
-    -   In `data_config.json`, include a `"preproc_file"` key to define the path to your preprocessing Python file and the function within it.
-    -   Use the format `"filename.py:function_name"`. In the absence of `function_name`, the `preproc_fn` function will be used as default preprocessing function from the `filename.py`.
+    -   In `data_config.json`, include the mandatory `"preproc_file"` key to define the path to your preprocessing Python file and the function within it.
+    -   Use the format `"filename.py:function_name"`. Both the filename and the function name are required.
         _Example:_
         ```json
         "preproc_file": "sample_dataset_preproc.py:preprocessing_fn"

From bfad6f0671e072ab836bcaf62a0a637315293e21 Mon Sep 17 00:00:00 2001
From: Meet Patel <meetkuma@qti.qualcomm.com>
Date: Tue, 5 Aug 2025 10:18:14 +0530
Subject: [PATCH 8/8] Minor correction to the documentation.

Signed-off-by: meetkuma <meetkuma@qti.qualcomm.com>
---
 docs/source/finetune.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/docs/source/finetune.md b/docs/source/finetune.md
index 2f05282a8..311605709 100644
--- a/docs/source/finetune.md
+++ b/docs/source/finetune.md
@@ -104,7 +104,7 @@ tensorboard --logdir runs/<file> --bind_all
         ```json
         "preproc_file": "sample_dataset_preproc.py:preprocessing_fn"
         ```
-    -   The preprocessing function must follow the structure below. The signature of the function should not be altered. The sample illustrates `apply_prompt_template` and `tokenize` as sub-functions, but we can define our own sub-functions as needed. For reference, check the example files in the [./QEfficient/finetune/dataset/](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset) directory.
+    -   The preprocessing function must follow the structure below. The function parameters and the return type of the function should not be altered. The sample illustrates `apply_prompt_template` and `tokenize` as sub-functions, but we can define our own sub-functions as needed. For reference, check the example files in the [./QEfficient/finetune/dataset/](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset) directory.
         ```python
         def preprocessing_fn(dataset_config, tokenizer, split, context_length=None):
             # Load the dataset or read from the disk
@@ -139,12 +139,10 @@ tensorboard --logdir runs/<file> --bind_all
         -   In the sample preprocessing function above, the `split` variable takes its value from `data_config.json`. For the training dataset, the value will be taken from the `"train_split"` key, and for the evaluation/test dataset, it will be taken from the `"test_split"` key.
         -   Additional arguments needed for the preprocessing function can be passed in `data_config.json` and will be available via the `dataset_config` variable within the function. For instance, in the sample config above, `"test_split_ratio"` and `"disc_style"` keys can be used in the preprocessing function to define the test split ratio and style of the dataset. These values are accessed through the `dataset_config` variable. Check out the sample preprocessing file at [./QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py).
 
-
-
 3.  **Custom Collate Function for Batching**
     -   When using a batch size greater than 1, we may need to override the default collate (batching different samples together in a batch) behavior by including a `"collate_file"` key in `data_config.json`.
     -   Use the same `"file.py:function"` format. If omitted, the default Hugging Face `DataCollatorForSeq2Seq` is typically used, which pads sequences to the longest length in the batch.
-    -   A custom collate function must have the following signature. The signature of the function should not be altered:
+    -   A custom collate function must follow the structure below. The function parameters and the return type of the function should not be altered:
         ```python
         def get_data_collator(tokenizer):
             # Define and return a custom collate_fn here