From 6c4990f1d3c76ada2365baf3d2be31166956a830 Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Tue, 10 Jun 2025 06:30:01 +0000 Subject: [PATCH 1/7] fixing utilities --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5f514f7..00ae19d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ *.ipynb -*.parquet \ No newline at end of file +*.parquet +dataset/ +models/ +local_util/ \ No newline at end of file From 219f7b42f310922e7dd7ce6553ed32573f7349d3 Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Tue, 10 Jun 2025 06:35:23 +0000 Subject: [PATCH 2/7] restructuring --- src/train/SeleKT/run.sh | 0 src/train/{ => SeleKT}/selekt.py | 0 src/train/{ => configs}/ds_config.json | 0 src/train/{ => configs}/general_acc.yaml | 0 src/train/{ => lora}/lora.py | 0 src/train/{ => lora}/merge_lora.py | 0 src/train/lora/run.sh | 0 src/train/sft/run.sh | 0 src/train/{ => sft}/sft.py | 0 9 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/train/SeleKT/run.sh rename src/train/{ => SeleKT}/selekt.py (100%) rename src/train/{ => configs}/ds_config.json (100%) rename src/train/{ => configs}/general_acc.yaml (100%) rename src/train/{ => lora}/lora.py (100%) rename src/train/{ => lora}/merge_lora.py (100%) create mode 100644 src/train/lora/run.sh create mode 100644 src/train/sft/run.sh rename src/train/{ => sft}/sft.py (100%) diff --git a/src/train/SeleKT/run.sh b/src/train/SeleKT/run.sh new file mode 100644 index 0000000..e69de29 diff --git a/src/train/selekt.py b/src/train/SeleKT/selekt.py similarity index 100% rename from src/train/selekt.py rename to src/train/SeleKT/selekt.py diff --git a/src/train/ds_config.json b/src/train/configs/ds_config.json similarity index 100% rename from src/train/ds_config.json rename to src/train/configs/ds_config.json diff --git a/src/train/general_acc.yaml b/src/train/configs/general_acc.yaml similarity index 100% rename from src/train/general_acc.yaml rename to src/train/configs/general_acc.yaml diff --git a/src/train/lora.py b/src/train/lora/lora.py similarity index 100% rename from src/train/lora.py rename to src/train/lora/lora.py diff --git a/src/train/merge_lora.py b/src/train/lora/merge_lora.py similarity index 100% rename from src/train/merge_lora.py rename to src/train/lora/merge_lora.py diff --git a/src/train/lora/run.sh b/src/train/lora/run.sh new file mode 100644 index 0000000..e69de29 diff --git a/src/train/sft/run.sh b/src/train/sft/run.sh new file mode 100644 index 0000000..e69de29 diff --git a/src/train/sft.py b/src/train/sft/sft.py similarity index 100% rename from src/train/sft.py rename to src/train/sft/sft.py From 06ac9a5664706a68d30458ba7706d89cd567a14a Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Tue, 10 Jun 2025 06:52:38 +0000 Subject: [PATCH 3/7] dynamic conversational-train loading in progress --- src/train/SeleKT/selekt.py | 8 +++-- src/train/sft/run.sh | 63 ++++++++++++++++++++++++++++++++++++++ src/train/sft/sft.py | 11 ++++--- 3 files changed, 75 insertions(+), 7 deletions(-) diff --git a/src/train/SeleKT/selekt.py b/src/train/SeleKT/selekt.py index b7c987b..76936d9 100644 --- a/src/train/SeleKT/selekt.py +++ b/src/train/SeleKT/selekt.py @@ -300,8 +300,10 @@ def train(args): print(f'Resuming from checkpoint: {last_checkpoint}') - # response_template = "#RESPONSE\n" - # collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) + collator = None + if args.is_conversational_training: + response_template = "#RESPONSE\n" + 
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) callback = Callback(base_model_path=args.base_model_path, flush_steps=1, alpha=args.alpha) trainer = SFTTrainer( @@ -310,7 +312,7 @@ def train(args): train_dataset=dataset, args=training_config, callbacks=[callback], - # data_collator=collator, + data_collator=collator, ) callback.set_trainer(trainer) print(f"Starting training for epoch {args.num_train_epochs}") diff --git a/src/train/sft/run.sh b/src/train/sft/run.sh index e69de29..54d7a04 100644 --- a/src/train/sft/run.sh +++ b/src/train/sft/run.sh @@ -0,0 +1,63 @@ +#!/bin/bash + + +export MODEL_NAME="" +export DESC="" + +OUTPUT_DIR="" +TRAIN_DATA="" +MODEL_PATH="" + +mkdir -p $OUTPUT_DIR + +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + sft.py \ + --model_name_or_path "$MODEL_PATH" \ + --train_data_path "$TRAIN_DATA" \ + --output_dir ${OUTPUT_DIR} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "" \ + + + +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + sft.py \ + --model_name_or_path "${MODEL_PATH}" \ + --train_data_path "$TRAIN_DATA" \ + --output_dir ${OUTPUT_DIR} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "" \ + --is_conversational_training \ + diff --git a/src/train/sft/sft.py b/src/train/sft/sft.py index ebdd7e0..0b3f17f 100644 --- a/src/train/sft/sft.py +++ b/src/train/sft/sft.py @@ -65,6 +65,8 @@ def parse_args(): parser.add_argument("--debug", type=bool, default=False) parser.add_argument("--packing", type=bool, default=True, help="Whether to use packing for training") + parser.add_argument("--is_conversational_training", type=bool, action='store_true', + help="Whether to use conversational training format") args, _ = parser.parse_known_args() return args @@ -108,7 +110,6 @@ def __init__(self, flush_steps=None): self.flush_steps = flush_steps def on_step_end(self, args, state, control, model, processing_class , **kwargs): - # import sys; sys.exit(0) if state.global_step % self.flush_steps == 0: get_accelerator().empty_cache() if dist.is_initialized(): @@ -172,8 +173,10 @@ def main(): if last_checkpoint: print(f'Resuming from checkpoint: {last_checkpoint}') - # response_template = "#RESPONSE\n" - # collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) + collator = None + if args.is_conversational_training: + response_template = "#RESPONSE\n" + collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) # Initialize trainer trainer = SFTTrainer( @@ -182,7 +185,7 @@ def main(): train_dataset=dataset, args=training_config, callbacks=[Callback(flush_steps=1)], - # data_collator=collator, + 
data_collator=collator,
     )
 
     # Start training

From 152352ba1ad48ee2b24af19727354c51a0e9d50d Mon Sep 17 00:00:00 2001
From: SwayamInSync
Date: Tue, 10 Jun 2025 08:40:30 +0000
Subject: [PATCH 4/7] combining workflow

---
 src/train/README.md          | 137 ++++---------------------------
 src/train/SeleKT/run.sh      | 112 +++++++++++++++++++++++++
 src/train/SeleKT/selekt.py   |   5 +-
 src/train/lora/lora.py       |  12 ++-
 src/train/lora/merge_lora.py |  77 +++++++++++++++---
 src/train/lora/run.sh        | 154 +++++++++++++++++++++++++++++++++++
 src/train/sft/run.sh         |  69 +++++++++++++---
 src/train/sft/sft.py         |   3 +-
 8 files changed, 423 insertions(+), 146 deletions(-)

diff --git a/src/train/README.md b/src/train/README.md
index 64b81b0..cf0b489 100644
--- a/src/train/README.md
+++ b/src/train/README.md
@@ -1,12 +1,10 @@
 # Model Training scripts
 ## Folder Structure
-- `ds_config.json` contains the deepspeed configuration
-- `general_acc.yaml` contains the accelerate configuration (might need to be modified as per desired system)
-- `lora.py` contains the code for training model with LoRA
-- `merge_lora.py` contains the code for merging trained LoRA adapters back to model for inference
-- `seletkt.py` contains the code for training model with our algorithm explained in our paper
-- `sft.py` contains the code for training model with Full Supervised Finetuning
+- `configs` contains the deepspeed and accelerate configurations (modifiable as per the system)
+- `lora` contains the code for training the model with LoRA
+- `SeleKT` contains the code for training the model with the SeleKT algorithm explained in our paper
+- `sft` contains the code for training the model with Full Supervised Finetuning
 
 ## Usgae
 ### Preparing the dataset
@@ -23,122 +21,23 @@
 ### Training with SFT
 - modify or replace the `general_acc.yaml` file as per the desired system configuration
 - set the `zero_optimization-stage` to `3` and `overlap_comm` to `false` in `ds_config` for better memory optimizations
-- Run the following command to start training
-  ```bash
-  deepspeed sft.py \
-    --model_name_or_path "path to pretrained LLM" \
-    --train_data_path "path to training data" \
-    --output_dir "path to output dir" \
-    --num_train_epochs 3 \
-    --model_max_length 8192 \
-    --per_device_train_batch_size 4 \
-    --gradient_accumulation_steps 4 \
-    --save_strategy "epoch" \
-    --save_steps 760 \
-    --save_total_limit 25 \
-    --learning_rate 1e-5 \
-    --warmup_ratio 0.1 \
-    --logging_steps 5 \
-    --report_to "wandb" \
-    --gradient_checkpointing True \
-    --deepspeed ds_config.json \
-    --bf16 True \
-    --run_name "Run name for logs" \
-    --debug True \
-  ```
-  Update the above command as per the model
-- To train on conversation data by only applying loss on the response, uncomment the lines 175, 176 and 185 and run the same command with proper conversational dataset path
-  ```python
-  response_template = "#RESPONSE\n"
-  collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
-
-  # Initialize trainer
-  trainer = SFTTrainer(
-      model=model,
-      processing_class=tokenizer,
-      train_dataset=dataset,
-      args=training_config,
-      callbacks=[Callback(flush_steps=1)],
-      data_collator=collator, # pass the collator in the trainer
-  )
-  ```
+- Add the respective variables like `MODEL_PATH`, `TRAIN_DATA`, `OUTPUT_DIR` etc. in the `run.sh` script and run
+```bash
+bash ./sft/run.sh
+```
 
 ### Training with LoRA
 - modify or replace the `general_acc.yaml` file as per the desired system configuration
-- set the `zero_optimization-stage` to `2` and `overlap_comm` to `false` in `ds_config` for better memory optimizations
-- Run the following command to start training
-  ```bash
-  deepspeed lora.py \
-    --model_name_or_path "path to pretrained LLM" \
-    --train_data_path "path to training data" \
-    --output_dir "path to output dir" \
-    --num_train_epochs 3 \
-    --model_max_length 8192 \
-    --per_device_train_batch_size 4 \
-    --gradient_accumulation_steps 4 \
-    --save_strategy "epoch" \
-    --save_steps 760 \
-    --save_total_limit 25 \
-    --learning_rate 1e-5 \
-    --warmup_ratio 0.1 \
-    --logging_steps 5 \
-    --report_to "wandb" \
-    --gradient_checkpointing True \
-    --deepspeed ds_config.json \
-    --bf16 True \
-    --run_name "Run name for logs" \
-    --debug True \
-  ```
-  Update the above command as per the model
-- Put the path of output LoRA adapters inside `merge_lora.py` and run following to get the final checkpoints
-  ```bash
-  python merge_lora.py
-  ```
+- set the `zero_optimization-stage` to `2` and `overlap_comm` to `false` in `ds_config`
+- Add the respective variables like `MODEL_PATH`, `TRAIN_DATA`, `OUTPUT_DIR` etc. in the `run.sh` script and run
+```bash
+bash ./lora/run.sh
+```
+>`lora/lora.py` uses `use_reentrant: True` for gradient checkpointing, which allows using DeepSpeed ZeRO-3 optimization for large models.
 
 ### Training with SeleKT
 - modify or replace the `general_acc.yaml` file as per the desired system configuration
-- set the `zero_optimization-stage` to `2` and `overlap_comm` to `false` in `ds_config` for better memory optimizations
-- Run the following command to start training
-  ```bash
-  accelerate launch \
-    --config_file=general_acc.yaml \
-    selekt.py \
-    --model_name_or_path "path to pretrained LLM" \
-    --base_model_path "path to pretrained LLM" \
-    --train_data_path "path to training data" \
-    --output_dir "path to output directory" \
-    --num_train_epochs 3 \
-    --model_max_length 8192 \
-    --per_device_train_batch_size 4 \
-    --gradient_accumulation_steps 4 \
-    --save_strategy "steps" \
-    --save_steps "Enter the periodicity value M for seleKT" \
-    --save_total_limit 50 \
-    --learning_rate 1e-5 \
-    --warmup_ratio 0.1 \
-    --logging_steps 5 \
-    --report_to "wandb" \
-    --gradient_checkpointing True \
-    --deepspeed ds_config.json \
-    --bf16 True \
-    --run_name "Name for logs" \
-    --debug True \
-    --alpha "Enter value for desired alpha parameter for SeleKT" \
-  ```
-  Update the above command as per the model
-- To train on conversation data by only applying loss on the response, uncomment the lines 291, 292 and 301 and run the same command with proper conversational dataset path
-  ```python
-  ```python
-  response_template = "#RESPONSE\n"
-  collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
-
-  # Initialize trainer
-  trainer = SFTTrainer(
-      model=model,
-      processing_class=tokenizer,
-      train_dataset=dataset,
-      args=training_config,
-      callbacks=[Callback(flush_steps=1)],
-      data_collator=collator, # pass the collator in the trainer
-  )
-  ```
\ No newline at end of file
+- set the `zero_optimization-stage` to `3` and `overlap_comm` to `false` in `ds_config` for better memory optimizations
+- Add the respective variables like `MODEL_PATH`, `TRAIN_DATA`, `OUTPUT_DIR` etc.
in the `run.sh` script and run +```bash +bash ./selekt/run.sh \ No newline at end of file diff --git a/src/train/SeleKT/run.sh b/src/train/SeleKT/run.sh index e69de29..5faa5a5 100644 --- a/src/train/SeleKT/run.sh +++ b/src/train/SeleKT/run.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +export MODEL_NAME="" +export DESC="" + +# Stage 1: Instruction Training +OUTPUT_DIR_STAGE1="./output/selekt_stage1_instruction" +TRAIN_DATA_STAGE1="" +MODEL_PATH="" + +# Stage 2: Conversational Training +OUTPUT_DIR_STAGE2="./output/selekt_stage2_conversational" +TRAIN_DATA_STAGE2="" + +find_latest_checkpoint() { + local output_dir=$1 + local latest_checkpoint=$(find "$output_dir" -name "checkpoint-*" -type d | sort -V | tail -1) + echo "$latest_checkpoint" +} + +echo "Starting Stage 1: SeleKT Instruction Training..." +echo "Model: $MODEL_PATH" +echo "Training data: $TRAIN_DATA_STAGE1" +echo "Output directory: $OUTPUT_DIR_STAGE1" + +mkdir -p $OUTPUT_DIR_STAGE1 + +# Stage 1: Instruction Training +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + selekt.py \ + --model_name_or_path "$MODEL_PATH" \ + --train_data_path "$TRAIN_DATA_STAGE1" \ + --output_dir ${OUTPUT_DIR_STAGE1} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "${MODEL_NAME}_stage1_instruction" \ + --alpha 0.05 \ + +if [ $? -ne 0 ]; then + echo "Error: Stage 1 training failed!" + exit 1 +fi + +echo "Stage 1 completed successfully!" + +LATEST_CHECKPOINT=$(find_latest_checkpoint "$OUTPUT_DIR_STAGE1") + +if [ -z "$LATEST_CHECKPOINT" ]; then + echo "Error: No checkpoint found in $OUTPUT_DIR_STAGE1" + exit 1 +fi + +echo "Found latest checkpoint: $LATEST_CHECKPOINT" +echo "Starting Stage 2: SeleKT Conversational Training..." +echo "Model: $LATEST_CHECKPOINT" +echo "Training data: $TRAIN_DATA_STAGE2" +echo "Output directory: $OUTPUT_DIR_STAGE2" + +mkdir -p $OUTPUT_DIR_STAGE2 + +# Stage 2: Conversational Training +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + selekt.py \ + --model_name_or_path "${LATEST_CHECKPOINT}" \ + --train_data_path "$TRAIN_DATA_STAGE2" \ + --output_dir ${OUTPUT_DIR_STAGE2} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "${MODEL_NAME}_stage2_conversational" \ + --alpha 0.05 \ + --is_conversational_training \ + + +# Check if stage 2 completed successfully +if [ $? -ne 0 ]; then + echo "Error: Stage 2 training failed!" + exit 1 +fi + +echo "Stage 2 training completed!" +echo "Both training stages completed successfully!" 
+echo "Final model saved in: $OUTPUT_DIR_STAGE2" \ No newline at end of file diff --git a/src/train/SeleKT/selekt.py b/src/train/SeleKT/selekt.py index 76936d9..45a16c4 100644 --- a/src/train/SeleKT/selekt.py +++ b/src/train/SeleKT/selekt.py @@ -70,10 +70,11 @@ def parse_args(): help="Whether to use bf16 mixed precision training") parser.add_argument("--run_name", type=str, default=None) parser.add_argument("--use_liger", type=bool, default=False) - parser.add_argument("--debug", type=bool, default=False) parser.add_argument("--packing", type=bool, default=True, help="Whether to use packing for training") - parser.add_argument("--alpha", type=float, default=0.05,) + parser.add_argument("--alpha", type=float, default=0.05, help="Alpha value for SeleKT") + parser.add_argument("--is_conversational_training", action='store_true', + help="Whether to use conversational training format") args, _ = parser.parse_known_args() return args diff --git a/src/train/lora/lora.py b/src/train/lora/lora.py index 01fcd6a..6385456 100644 --- a/src/train/lora/lora.py +++ b/src/train/lora/lora.py @@ -66,9 +66,10 @@ def parse_args(): help="Whether to use bf16 mixed precision training") parser.add_argument("--run_name", type=str, default=None) parser.add_argument("--use_liger", type=bool, default=False) - parser.add_argument("--debug", type=bool, default=False) parser.add_argument("--packing", type=bool, default=True, help="Whether to use packing for training") + parser.add_argument("--is_conversational_training", action='store_true', + help="Whether to use conversational training format") args, _ = parser.parse_known_args() return args @@ -151,12 +152,13 @@ def main(): output_dir=args.output_dir, report_to="none", gradient_checkpointing=args.gradient_checkpointing, - gradient_checkpointing_kwargs={"use_reentrant": False}, + gradient_checkpointing_kwargs={"use_reentrant": True}, deepspeed=args.deepspeed, dataset_num_proc=80, run_name=args.run_name, use_liger=args.use_liger, ) + lora_config = LoraConfig( r=64, # target_modules= ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'], @@ -175,6 +177,11 @@ def main(): dataset = setup_training_data(args, local_rank, tokenizer) + collator = None + if args.is_conversational_training: + response_template = "#RESPONSE\n" + collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) + trainer = SFTTrainer( model=model, processing_class=tokenizer, @@ -182,6 +189,7 @@ def main(): args=training_config, peft_config=lora_config, callbacks=[Callback(flush_steps=1)], + data_collator=collator ) print("Starting LoRA training...") diff --git a/src/train/lora/merge_lora.py b/src/train/lora/merge_lora.py index f2c16dd..de4927b 100644 --- a/src/train/lora/merge_lora.py +++ b/src/train/lora/merge_lora.py @@ -1,16 +1,73 @@ +#!/usr/bin/env python3 + +import argparse +import torch from peft import AutoPeftModelForCausalLM from transformers import AutoTokenizer +import os -checkpoints = [] # add the paths to the checkpoints here +def parse_args(): + parser = argparse.ArgumentParser(description="Merge LoRA weights with base model") + parser.add_argument("--lora_checkpoint", type=str, required=True, + help="Path to the LoRA checkpoint directory") + parser.add_argument("--output_dir", type=str, required=True, + help="Directory to save the merged model") + parser.add_argument("--max_shard_size", type=str, default="5GB", + help="Maximum size of each shard when saving") + parser.add_argument("--safe_serialization", 
action="store_true", default=True, + help="Use safe serialization format") + return parser.parse_args() +def merge_lora_weights(lora_checkpoint, output_dir, max_shard_size="5GB", safe_serialization=True): + """ + Merge LoRA adapter weights with the base model + """ + print(f"Loading LoRA model from: {lora_checkpoint}") + + peft_model = AutoPeftModelForCausalLM.from_pretrained( + lora_checkpoint, + torch_dtype=torch.bfloat16, + device_map="auto" + ) + + print(f"Loading tokenizer from: {lora_checkpoint}") + tokenizer = AutoTokenizer.from_pretrained(lora_checkpoint) + + print("Merging LoRA adapters with base model...") + merged_model = peft_model.merge_and_unload() + + print(f"Saving merged model to: {output_dir}") + os.makedirs(output_dir, exist_ok=True) + + merged_model.save_pretrained( + output_dir, + max_shard_size=max_shard_size, + safe_serialization=safe_serialization + ) + + # Save the tokenizer + tokenizer.save_pretrained(output_dir) + + print(f"✅ Successfully merged and saved model to: {output_dir}") + + del peft_model, merged_model + torch.cuda.empty_cache() + + return output_dir -for lora_checkpoint in checkpoints[1:]: - peft_model = AutoPeftModelForCausalLM.from_pretrained(lora_checkpoint) - tokenizer = AutoTokenizer.from_pretrained(lora_checkpoint) +def main(): + args = parse_args() + + try: + merge_lora_weights( + lora_checkpoint=args.lora_checkpoint, + output_dir=args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization + ) + except Exception as e: + print(f"❌ Error during merging: {str(e)}") + raise e - merged_model = peft_model.merge_and_unload() - print(type(merged_model)) - output_path = lora_checkpoint + "-merged" - merged_model.save_pretrained(output_path) - tokenizer.save_pretrained(output_path) - print(f"Model saved at {output_path}") +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/train/lora/run.sh b/src/train/lora/run.sh index e69de29..2607b44 100644 --- a/src/train/lora/run.sh +++ b/src/train/lora/run.sh @@ -0,0 +1,154 @@ +#!/bin/bash + +export MODEL_NAME="" +export DESC="" + +# Stage 1: Instruction Training +OUTPUT_DIR_STAGE1="./output/stage1_instruction_lora" +TRAIN_DATA_STAGE1="" +MODEL_PATH="" + +# Stage 2: Conversational Training +OUTPUT_DIR_STAGE2="./output/stage2_conversational_lora" +TRAIN_DATA_STAGE2="" + +# Merged model directory +MERGED_MODEL_DIR="./output/stage1_merged" + +find_latest_checkpoint() { + local output_dir=$1 + local latest_checkpoint=$(find "$output_dir" -name "checkpoint-*" -type d | sort -V | tail -1) + echo "$latest_checkpoint" +} + +merge_lora_weights() { + local lora_checkpoint=$1 + local output_dir=$2 + + echo "Merging LoRA weights..." + echo "LoRA checkpoint: $lora_checkpoint" + echo "Output: $output_dir" + + python3 merge_lora.py \ + --lora_checkpoint "$lora_checkpoint" \ + --output_dir "$output_dir" \ + --safe_serialization + + return $? +} + +echo "Starting Stage 1: Instruction Training (LoRA)..." 
+echo "Model: $MODEL_PATH" +echo "Training data: $TRAIN_DATA_STAGE1" +echo "Output directory: $OUTPUT_DIR_STAGE1" + +mkdir -p $OUTPUT_DIR_STAGE1 + +# Stage 1: LoRA Instruction Training +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + lora.py \ + --model_name_or_path "$MODEL_PATH" \ + --train_data_path "$TRAIN_DATA_STAGE1" \ + --output_dir ${OUTPUT_DIR_STAGE1} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "${MODEL_NAME}_stage1_instruction_lora" \ + +if [ $? -ne 0 ]; then + echo "Error: Stage 1 training failed!" + exit 1 +fi + +echo "Stage 1 completed successfully!" + +# Find latest checkpoint +LATEST_CHECKPOINT=$(find_latest_checkpoint "$OUTPUT_DIR_STAGE1") + +if [ -z "$LATEST_CHECKPOINT" ]; then + echo "Error: No checkpoint found in $OUTPUT_DIR_STAGE1" + exit 1 +fi + +echo "Found latest checkpoint: $LATEST_CHECKPOINT" + +# Merge LoRA weights with base model +mkdir -p $MERGED_MODEL_DIR +merge_lora_weights "$LATEST_CHECKPOINT" "$MERGED_MODEL_DIR" + +if [ $? -ne 0 ]; then + echo "Error: LoRA merging failed!" + exit 1 +fi + +echo "LoRA weights merged successfully!" +echo "Starting Stage 2: Conversational Training (LoRA)..." +echo "Model: $MERGED_MODEL_DIR" +echo "Training data: $TRAIN_DATA_STAGE2" +echo "Output directory: $OUTPUT_DIR_STAGE2" + +mkdir -p $OUTPUT_DIR_STAGE2 + +# Stage 2: LoRA Conversational Training +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + lora.py \ + --model_name_or_path "${MERGED_MODEL_DIR}" \ + --train_data_path "$TRAIN_DATA_STAGE2" \ + --output_dir ${OUTPUT_DIR_STAGE2} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "${MODEL_NAME}_stage2_conversational_lora" \ + --is_conversational_training \ + +if [ $? -ne 0 ]; then + echo "Error: Stage 2 training failed!" + exit 1 +fi + +echo "Stage 2 training completed successfully!" + +# Find final checkpoint and merge again +FINAL_CHECKPOINT=$(find_latest_checkpoint "$OUTPUT_DIR_STAGE2") +FINAL_MERGED_DIR="./output/final_merged_model" + +if [ ! -z "$FINAL_CHECKPOINT" ]; then + echo "Merging final LoRA weights..." + mkdir -p $FINAL_MERGED_DIR + merge_lora_weights "$FINAL_CHECKPOINT" "$FINAL_MERGED_DIR" + echo "Final merged model saved in: $FINAL_MERGED_DIR" +else + echo "Warning: No final checkpoint found, using stage 2 output directory" +fi + +echo "Both training stages completed successfully!" 
+echo "LoRA adapters saved in: $OUTPUT_DIR_STAGE2" +echo "Final merged model saved in: $FINAL_MERGED_DIR" \ No newline at end of file diff --git a/src/train/sft/run.sh b/src/train/sft/run.sh index 54d7a04..e534fe0 100644 --- a/src/train/sft/run.sh +++ b/src/train/sft/run.sh @@ -1,21 +1,37 @@ #!/bin/bash - export MODEL_NAME="" export DESC="" -OUTPUT_DIR="" -TRAIN_DATA="" +# Stage 1: Instruction Training +OUTPUT_DIR_STAGE1="./output/sft_stage1_instruction" +TRAIN_DATA_STAGE1="" MODEL_PATH="" -mkdir -p $OUTPUT_DIR +# Stage 2: Conversational Training +OUTPUT_DIR_STAGE2="./output/sft_stage2_conversational" +TRAIN_DATA_STAGE2="" + +find_latest_checkpoint() { + local output_dir=$1 + local latest_checkpoint=$(find "$output_dir" -name "checkpoint-*" -type d | sort -V | tail -1) + echo "$latest_checkpoint" +} + +echo "Starting Stage 1: Instruction Training..." +echo "Model: $MODEL_PATH" +echo "Training data: $TRAIN_DATA_STAGE1" +echo "Output directory: $OUTPUT_DIR_STAGE1" +mkdir -p $OUTPUT_DIR_STAGE1 + +# Stage 1: Instruction Training accelerate launch \ --config_file=../configs/general_acc.yaml \ sft.py \ --model_name_or_path "$MODEL_PATH" \ - --train_data_path "$TRAIN_DATA" \ - --output_dir ${OUTPUT_DIR} \ + --train_data_path "$TRAIN_DATA_STAGE1" \ + --output_dir ${OUTPUT_DIR_STAGE1} \ --num_train_epochs 3 \ --model_max_length 16384 \ --per_device_train_batch_size 1 \ @@ -32,16 +48,37 @@ accelerate launch \ --gradient_checkpointing True \ --deepspeed ../configs/ds_config.json \ --bf16 True \ - --run_name "" \ + --run_name "${MODEL_NAME}_stage1_instruction" \ + +if [ $? -ne 0 ]; then + echo "Error: Stage 1 training failed!" + exit 1 +fi +echo "Stage 1 completed successfully!" +LATEST_CHECKPOINT=$(find_latest_checkpoint "$OUTPUT_DIR_STAGE1") +if [ -z "$LATEST_CHECKPOINT" ]; then + echo "Error: No checkpoint found in $OUTPUT_DIR_STAGE1" + exit 1 +fi + +echo "Found latest checkpoint: $LATEST_CHECKPOINT" +echo "Starting Stage 2: Conversational Training..." +echo "Model: $LATEST_CHECKPOINT" +echo "Training data: $TRAIN_DATA_STAGE2" +echo "Output directory: $OUTPUT_DIR_STAGE2" + +mkdir -p $OUTPUT_DIR_STAGE2 + +# Stage 2: Conversational Training accelerate launch \ --config_file=../configs/general_acc.yaml \ sft.py \ - --model_name_or_path "${MODEL_PATH}" \ - --train_data_path "$TRAIN_DATA" \ - --output_dir ${OUTPUT_DIR} \ + --model_name_or_path "${LATEST_CHECKPOINT}" \ + --train_data_path "$TRAIN_DATA_STAGE2" \ + --output_dir ${OUTPUT_DIR_STAGE2} \ --num_train_epochs 3 \ --model_max_length 16384 \ --per_device_train_batch_size 1 \ @@ -58,6 +95,16 @@ accelerate launch \ --gradient_checkpointing True \ --deepspeed ../configs/ds_config.json \ --bf16 True \ - --run_name "" \ + --run_name "${MODEL_NAME}_stage2_conversational" \ --is_conversational_training \ + +# Check if stage 2 completed successfully +if [ $? -ne 0 ]; then + echo "Error: Stage 2 training failed!" + exit 1 +fi + +echo "Stage 2 training completed!" +echo "Both training stages completed successfully!" 
+echo "Final model saved in: $OUTPUT_DIR_STAGE2" \ No newline at end of file diff --git a/src/train/sft/sft.py b/src/train/sft/sft.py index 0b3f17f..08a4abd 100644 --- a/src/train/sft/sft.py +++ b/src/train/sft/sft.py @@ -62,10 +62,9 @@ def parse_args(): help="Whether to use bf16 mixed precision training") parser.add_argument("--run_name", type=str, default=None) parser.add_argument("--use_liger", type=bool, default=False) - parser.add_argument("--debug", type=bool, default=False) parser.add_argument("--packing", type=bool, default=True, help="Whether to use packing for training") - parser.add_argument("--is_conversational_training", type=bool, action='store_true', + parser.add_argument("--is_conversational_training", action='store_true', help="Whether to use conversational training format") args, _ = parser.parse_known_args() From caf429ff82db7915b7bbf7f8d81ae821566cba5d Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Tue, 10 Jun 2025 09:13:42 +0000 Subject: [PATCH 5/7] adding contribution.md file --- CONTRIBUTING.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..c282e9a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,14 @@ +# Contributing + +This project welcomes contributions and suggestions. Most contributions require you to +agree to a Contributor License Agreement (CLA) declaring that you have the right to, +and actually do, grant us the rights to use your contribution. For details, visit +https://cla.microsoft.com. + +When you submit a pull request, a CLA-bot will automatically determine whether you need +to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the +instructions provided by the bot. You will only need to do this once across all repositories using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. \ No newline at end of file From 6f15dc73a79cc3c763e5826369309236dfa278fd Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Tue, 10 Jun 2025 09:19:43 +0000 Subject: [PATCH 6/7] added citations --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 086a60f..66938d8 100644 --- a/README.md +++ b/README.md @@ -112,3 +112,17 @@ trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. 
+ +## Citation + +```bibtex +@misc{aggarwal2025robustlearningdiversecode, + title={Robust Learning of Diverse Code Edits}, + author={Tushar Aggarwal and Swayam Singh and Abhijeet Awasthi and Aditya Kanade and Nagarajan Natarajan}, + year={2025}, + eprint={2503.03656}, + archivePrefix={arXiv}, + primaryClass={cs.SE}, + url={https://arxiv.org/abs/2503.03656}, +} +``` \ No newline at end of file From 8523d5fbdf337922a789ef90c29d0db8d90dda2b Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Tue, 10 Jun 2025 09:34:13 +0000 Subject: [PATCH 7/7] updating citation --- README.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 66938d8..f80411b 100644 --- a/README.md +++ b/README.md @@ -116,13 +116,11 @@ Any use of third-party trademarks or logos are subject to those third-party's po ## Citation ```bibtex -@misc{aggarwal2025robustlearningdiversecode, - title={Robust Learning of Diverse Code Edits}, - author={Tushar Aggarwal and Swayam Singh and Abhijeet Awasthi and Aditya Kanade and Nagarajan Natarajan}, - year={2025}, - eprint={2503.03656}, - archivePrefix={arXiv}, - primaryClass={cs.SE}, - url={https://arxiv.org/abs/2503.03656}, +@inproceedings{aggarwal2025nextcoder, +author = {Aggarwal, Tushar and Singh, Swayam and Awasthi, Abhijeet and Kanade, Aditya and Natarajan, Nagarajan}, +title = {NextCoder: Robust Adaptation of Code LMs to Diverse Code Edits}, +booktitle = {International Conference on Machine Learning}, +year = {2025}, +url = {https://www.microsoft.com/en-us/research/publication/nextcoder-robust-adaptation-of-code-lms-to-diverse-code-edits/}, } ``` \ No newline at end of file
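
A note on the `--is_conversational_training` flag introduced in patches 3 and 4: it only switches the trainer from plain packed SFT to completion-only loss masking via TRL's `DataCollatorForCompletionOnlyLM`, keyed on the `#RESPONSE\n` marker. The sketch below shows that wiring in isolation; it is a minimal example under stated assumptions, not code from the patches — the `gpt2` checkpoint, the toy one-example dataset, and the `#INSTRUCTION` prefix are placeholders, and packing is left off because the completion-only collator is generally used without packing.

```python
# Minimal sketch (placeholders, not part of the patches): how completion-only
# loss masking is wired into SFTTrainer when --is_conversational_training is set.
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer

model_name = "gpt2"  # placeholder checkpoint; the patches train a code LM instead
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # the collator needs a pad token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Toy conversational-style example: everything before "#RESPONSE\n" is prompt,
# everything after it is the completion that the loss should cover.
dataset = Dataset.from_list([
    {"text": "#INSTRUCTION\nRename variable x to count.\n#RESPONSE\ncount = 0\n"},
])

response_template = "#RESPONSE\n"
collator = DataCollatorForCompletionOnlyLM(response_template=response_template,
                                           tokenizer=tokenizer)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
    args=SFTConfig(output_dir="./tmp_completion_only", max_steps=1,
                   packing=False, report_to="none"),
    data_collator=collator,  # masks prompt tokens; loss is computed on the response only
)
trainer.train()
```

With this in place, the flag in `sft.py`, `lora.py`, and `selekt.py` simply decides whether `data_collator` stays `None` (packed, full-sequence loss) or becomes this collator (response-only loss), which is exactly the `collator = None` / `if args.is_conversational_training:` branch added in patch 3 and reused in patch 4.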