[QEff. Finetune] Updated handling of custom dataset in FT. Updated finetune.md readme file. #520
base: main
New file — a LoRA (PEFT) adapter configuration (`@@ -0,0 +1,17 @@`):

```json
{
    "r": 32,
    "lora_alpha": 64,
    "target_modules": [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj"
    ],
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "lora_dropout": 0.05,
    "inference_mode": false
}
```
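
This JSON maps directly onto `peft.LoraConfig`; a minimal loading sketch, assuming the file is saved as `lora_config.json` (hypothetical name, not specified in this diff):

```python
import json

from peft import LoraConfig

# Read the JSON shown above (the file name is an assumption) and build a LoraConfig.
with open("lora_config.json") as f:
    lora_kwargs = json.load(f)

lora_config = LoraConfig(**lora_kwargs)
print(lora_config.r, lora_config.lora_alpha, lora_config.target_modules)
```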
New file — a custom dataset configuration for the DiSC example (`@@ -0,0 +1,7 @@`):

```json
{
    "train_split": "train",
    "test_split": "test",
    "test_split_ratio": 0.15,
    "preproc_file": "./QEfficient/finetune/dataset/custom_dataset/disc_preproc.py:get_preprocessed_disc",
    "disc_style": "sarcasm_more"
}
```
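
The `preproc_file` value follows a `<path-to-file.py>:<function-name>` convention pointing at the preprocessing entry point. A minimal sketch of how such a string can be resolved (the helper below is illustrative, not the loader QEfficient actually ships):

```python
import importlib.util


def load_preproc_fn(preproc_file: str):
    """Resolve '<path/to/file.py>:<function_name>' to the callable it names."""
    file_path, func_name = preproc_file.rsplit(":", 1)
    spec = importlib.util.spec_from_file_location("custom_preproc", file_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, func_name)


# Usage with the sample config above:
preproc = load_preproc_fn(
    "./QEfficient/finetune/dataset/custom_dataset/disc_preproc.py:get_preprocessed_disc"
)
```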
New file — the DiSC preprocessing module referenced by the sample config above, `QEfficient/finetune/dataset/custom_dataset/disc_preproc.py` (`@@ -0,0 +1,87 @@`):

```python
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

import datasets
from transformers.data import DataCollatorForSeq2Seq


def get_data_collator(tokenizer):
    return DataCollatorForSeq2Seq(tokenizer)


def get_preprocessed_disc(dataset_config, tokenizer, split, context_length=None):
    dataset = datasets.load_dataset("hallisky/DiSC")

    # Considering 'train' split as this dataset has only one split.
    dataset = dataset["train"]

    test_split_ratio = dataset_config.test_split_ratio
    disc_style = dataset_config.disc_style

    # Only collect the samples for a given style.
    available_styles = set(dataset["category"])
    if disc_style not in available_styles:
        raise RuntimeError(f"For DiSC dataset the provided disc_style '{disc_style}' is not supported.")

    dataset = dataset.filter(lambda example: example["category"] == disc_style)

    # Shuffle the dataset before splitting.
    dataset = dataset.shuffle(seed=42)

    # Split the data into train and test splits.
    total_samples = len(dataset)
    test_size = int(total_samples * test_split_ratio)
    train_size = total_samples - test_size

    if split == "test":
        indices = range(train_size, total_samples)
    else:
        indices = range(0, train_size)

    dataset = dataset.select(indices)

    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})

    # Below is the template of the DiSC dataset.
    # <bos>### Original:{original} \n ### Rewrite: {rewrite} <eos>
    template = "### Original:{original} \n ### Rewrite: "

    def apply_prompt_template(sample):
        return {
            "input": template.format(original=sample["original"]),
            "label": sample["generation"],
        }

    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))

    def tokenize_add_label(sample):
        input = tokenizer.encode(
            tokenizer.bos_token + sample["input"],
            add_special_tokens=False,
            max_length=context_length,
            pad_to_max_length=True,
        )
        label = tokenizer.encode(
            sample["label"] + tokenizer.pad_token + tokenizer.eos_token,
            add_special_tokens=False,
            max_length=context_length,
            pad_to_max_length=True,
        )

        sample = {
            "input_ids": (input + label),
            "attention_mask": [1] * (len(input) + len(label)),
            "labels": [-100] * len(input) + label,
        }

        return sample

    dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features))

    return dataset
```

> **Review comment** on the `get_preprocessed_disc` signature: Please pass only kwargs here so that the user has the flexibility to pass and use whichever arguments they need.
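
For illustration, the preprocessing function above can be exercised standalone. This is a sketch under stated assumptions: the tokenizer choice is arbitrary (any causal-LM tokenizer with a `bos_token` should do), and the namedtuple merely mimics the `dataset_config` object built in the config utilities, with fields mirroring the sample JSON:

```python
from collections import namedtuple

from transformers import AutoTokenizer

# Stand-in for the dataset_config object; field names mirror the sample JSON above.
DiscConfig = namedtuple("DiscConfig", ["train_split", "test_split", "test_split_ratio", "disc_style"])
cfg = DiscConfig(train_split="train", test_split="test", test_split_ratio=0.15, disc_style="sarcasm_more")

# Assumption: the gpt2 tokenizer is only a convenient public example.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# get_preprocessed_disc is the function defined in the file above.
train_ds = get_preprocessed_disc(cfg, tokenizer, split="train", context_length=None)
print(train_ds[0].keys())  # input_ids, attention_mask, labels
```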
Changes to the finetuning config utilities module (the one defining `generate_peft_config` and `generate_dataset_config`). First, the imports:

```diff
@@ -8,13 +8,14 @@
 import inspect
 import json
 import os
+from collections import namedtuple
 from dataclasses import asdict
-from typing import Any, Dict
+from typing import Any, Dict, Optional

 import yaml
 from peft import LoraConfig as PeftLoraConfig

-import QEfficient.finetune.configs.dataset_config as datasets
+import QEfficient.finetune.configs.dataset_config as qeff_datasets
 from QEfficient.finetune.configs.peft_config import LoraConfig
 from QEfficient.finetune.configs.training import TrainConfig
 from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC
```
Next, `generate_dataset_config` gains an optional `custom_dataset_config` parameter:

```diff
@@ -86,11 +87,14 @@ def generate_peft_config(train_config: TrainConfig, **kwargs) -> Any:
     return peft_config


-def generate_dataset_config(dataset_name: str) -> Any:
+def generate_dataset_config(dataset_name: str, custom_dataset_config: Optional[str] = None) -> Any:
     """Generate a dataset configuration based on the specified dataset.

     Args:
         dataset_name (str): Name of the dataset to be used for finetuning.
+        custom_dataset_config (str): Dataset config json file for the custom dataset.
+            This file contains dataset-specific arguments to be used in the dataset
+            preprocessing step.

     Returns:
         Any: A dataset configuration object.
```
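
Assuming the DiSC config JSON above is saved to disk (the path below is illustrative, not part of this diff), the new parameter would be used roughly like this:

```python
# Hypothetical call site; the JSON path is an assumption.
dataset_config = generate_dataset_config(
    dataset_name="custom_dataset",
    custom_dataset_config="configs/disc_dataset_config.json",
)
print(dataset_config.disc_style)  # keys from the JSON surface as attributes, as the next hunk shows
```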
The custom-dataset handling added inside `generate_dataset_config`:

```diff
@@ -101,7 +105,20 @@ def generate_dataset_config(dataset_name: str) -> Any:
     supported_datasets = DATASET_PREPROC.keys()
     assert dataset_name in supported_datasets, f"Given dataset '{dataset_name}' is not supported."
     # FIXME (Meet): Replace below logic by creating using auto registry of datasets.
-    dataset_config = {k: v for k, v in inspect.getmembers(datasets)}[dataset_name]()
+    dataset_config = {k: v for k, v in inspect.getmembers(qeff_datasets)}[dataset_name]()
+    if dataset_name == "custom_dataset":
+        if custom_dataset_config is None:
+            logger.raise_error(
+                "For 'custom_dataset', please provide dataset config file via 'custom_dataset_config' flag.",
+                RuntimeError,
+            )
+
+        custom_dataset_dict = asdict(dataset_config)
+        custom_dataset_dict_override = load_config_file(custom_dataset_config)
+        # Override existing and add new params to dataset_config.
+        custom_dataset_dict.update(custom_dataset_dict_override)
+
+        custom_dataset_class = namedtuple("custom_dataset", custom_dataset_dict.keys())
+        dataset_config = custom_dataset_class(**custom_dataset_dict)
     return dataset_config
```

Review thread on this block:

> **Reviewer:** Why are we not handling the changes done in this method in the `update_config()` instead?

> **Author:** Here, the `dataset_config` which is defined at L108 is not a dict but a dataclass. In order to update the params of the existing dataclass we need to convert it into a dict, add and override keys from the `custom_dataset_config` file, and then construct the dataclass again. This code is doing that.
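
To illustrate the pattern described in that reply (default config as a dataclass, converted to a dict, overlaid with the JSON overrides, then rebuilt as an attribute-style object), here is a standalone sketch with hypothetical field names, not code from this PR:

```python
from collections import namedtuple
from dataclasses import asdict, dataclass


# Hypothetical default config, standing in for the dataclasses in
# QEfficient.finetune.configs.dataset_config.
@dataclass
class custom_dataset:
    train_split: str = "train"
    test_split: str = "test"


default_config = custom_dataset()

# Overrides as they would come from the custom dataset JSON file.
overrides = {"test_split_ratio": 0.15, "disc_style": "sarcasm_more"}

# dataclass -> dict, then merge: existing keys are updated, new keys are added.
merged = asdict(default_config)
merged.update(overrides)

# Rebuild an attribute-style object so downstream code can keep using
# dataset_config.<field> access, even for fields the dataclass never declared.
config_cls = namedtuple("custom_dataset", merged.keys())
dataset_config = config_cls(**merged)

print(dataset_config.disc_style)  # "sarcasm_more"
```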