5 changes: 4 additions & 1 deletion .gitignore
@@ -1,2 +1,5 @@
*.ipynb
*.parquet
*.parquet
dataset/
models/
local_util/
14 changes: 14 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,14 @@
# Contributing

This project welcomes contributions and suggestions. Most contributions require you to
agree to a Contributor License Agreement (CLA) declaring that you have the right to,
and actually do, grant us the rights to use your contribution. For details, visit
https://cla.microsoft.com.

When you submit a pull request, a CLA-bot will automatically determine whether you need
to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
instructions provided by the bot. You will only need to do this once across all repositories using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
or contact [[email protected]](mailto:[email protected]) with any additional questions or comments.
12 changes: 12 additions & 0 deletions README.md
@@ -112,3 +112,15 @@ trademarks or logos is subject to and must follow
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
Any use of third-party trademarks or logos is subject to those third parties' policies.

## Citation

```bibtex
@inproceedings{aggarwal2025nextcoder,
author = {Aggarwal, Tushar and Singh, Swayam and Awasthi, Abhijeet and Kanade, Aditya and Natarajan, Nagarajan},
title = {NextCoder: Robust Adaptation of Code LMs to Diverse Code Edits},
booktitle = {International Conference on Machine Learning},
year = {2025},
url = {https://www.microsoft.com/en-us/research/publication/nextcoder-robust-adaptation-of-code-lms-to-diverse-code-edits/},
}
```
137 changes: 18 additions & 119 deletions src/train/README.md
@@ -1,12 +1,10 @@
# Model Training scripts

## Folder Structure
- `ds_config.json` contains the deepspeed configuration
- `general_acc.yaml` contains the accelerate configuration (might need to be modified as per desired system)
- `lora.py` contains the code for training model with LoRA
- `merge_lora.py` contains the code for merging trained LoRA adapters back to model for inference
- `seletkt.py` contains the code for training model with our algorithm explained in our paper
- `sft.py` contains the code for training model with Full Supervised Finetuning
- `configs` contains the DeepSpeed and Accelerate configurations (modifiable as per the system)
- `lora` contains the code for training a model with LoRA
- `SeleKT` contains the code for training a model with the SeleKT algorithm described in our paper
- `sft` contains the code for training a model with full supervised fine-tuning

## Usage
### Preparing the dataset
@@ -23,122 +21,23 @@
### Training with SFT
- Modify or replace the `general_acc.yaml` file as per the desired system configuration
- Set `zero_optimization.stage` to `3` and `overlap_comm` to `false` in `ds_config.json` for better memory optimization
- Run the following command to start training
```bash
deepspeed sft.py \
--model_name_or_path "path to pretrained LLM" \
--train_data_path "path to training data" \
--output_dir "path to output dir" \
--num_train_epochs 3 \
--model_max_length 8192 \
--per_device_train_batch_size 4 \
--gradient_accumulation_steps 4 \
--save_strategy "epoch" \
--save_steps 760 \
--save_total_limit 25 \
--learning_rate 1e-5 \
--warmup_ratio 0.1 \
--logging_steps 5 \
--report_to "wandb" \
--gradient_checkpointing True \
--deepspeed ds_config.json \
--bf16 True \
--run_name "Run name for logs" \
--debug True \
```
Update the above command as needed for your model
- To train on conversational data while applying the loss only to the response, uncomment lines 175, 176, and 185 and run the same command with the path to a suitable conversational dataset:
```python
response_template = "#RESPONSE\n"
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)

# Initialize trainer
trainer = SFTTrainer(
model=model,
processing_class=tokenizer,
train_dataset=dataset,
args=training_config,
callbacks=[Callback(flush_steps=1)],
data_collator=collator, # pass the collator in the trainer
)
```
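Note that the `response_template` string must appear verbatim in each formatted training example so that `DataCollatorForCompletionOnlyLM` can locate where the response begins.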
- Add the respective variables such as `MODEL_PATH`, `TRAIN_DATA`, and `OUTPUT_DIR` in the `run.sh` script and run
```bash
bash ./sft/run.sh
```

### Training with LoRA
- Modify or replace the `general_acc.yaml` file as per the desired system configuration
- Set `zero_optimization.stage` to `2` and `overlap_comm` to `false` in `ds_config.json` for better memory optimization
- Run the following command to start training
```bash
deepspeed lora.py \
--model_name_or_path "path to pretrained LLM" \
--train_data_path "path to training data" \
--output_dir "path to output dir" \
--num_train_epochs 3 \
--model_max_length 8192 \
--per_device_train_batch_size 4 \
--gradient_accumulation_steps 4 \
--save_strategy "epoch" \
--save_steps 760 \
--save_total_limit 25 \
--learning_rate 1e-5 \
--warmup_ratio 0.1 \
--logging_steps 5 \
--report_to "wandb" \
--gradient_checkpointing True \
--deepspeed ds_config.json \
--bf16 True \
--run_name "Run name for logs" \
--debug True \
```
Update the above command as needed for your model
- Put the path of the output LoRA adapters inside `merge_lora.py` and run the following to get the final checkpoints (a rough sketch of what such a merge script typically does is shown after the command):
```bash
python merge_lora.py
```
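`merge_lora.py` itself is not shown in this diff. As a rough illustration only, a merge step of this kind typically follows the standard `peft` merge-and-unload flow; the paths below are hypothetical placeholders.
```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

BASE_MODEL = "path/to/pretrained/LLM"   # hypothetical placeholder
ADAPTER_DIR = "path/to/lora/adapters"   # hypothetical placeholder
MERGED_DIR = "path/to/merged/model"     # hypothetical placeholder

# Load the base model, attach the trained LoRA adapters, and fold them into the base weights.
base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
merged = model.merge_and_unload()

# Save the merged checkpoint (plus tokenizer) so it can be loaded like a regular HF model for inference.
merged.save_pretrained(MERGED_DIR)
AutoTokenizer.from_pretrained(BASE_MODEL).save_pretrained(MERGED_DIR)
```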
- Set `zero_optimization.stage` to `2` and `overlap_comm` to `false` in `ds_config.json`
- Add the respective variables such as `MODEL_PATH`, `TRAIN_DATA`, and `OUTPUT_DIR` in the `run.sh` script and run
```bash
bash ./lora/run.sh
```
> `lora/lora.py` uses `use_reentrant: True` for gradient checkpointing, which can allow using DeepSpeed ZeRO-3 optimization for large models.

### Training with SeleKT
- Modify or replace the `general_acc.yaml` file as per the desired system configuration
- Set `zero_optimization.stage` to `2` and `overlap_comm` to `false` in `ds_config.json` for better memory optimization
- Run the following command to start training
```bash
accelerate launch \
--config_file=general_acc.yaml \
selekt.py \
--model_name_or_path "path to pretrained LLM" \
--base_model_path "path to pretrained LLM" \
--train_data_path "path to training data" \
--output_dir "path to output directory" \
--num_train_epochs 3 \
--model_max_length 8192 \
--per_device_train_batch_size 4 \
--gradient_accumulation_steps 4 \
--save_strategy "steps" \
--save_steps "Enter the periodicity value M for SeleKT" \
--save_total_limit 50 \
--learning_rate 1e-5 \
--warmup_ratio 0.1 \
--logging_steps 5 \
--report_to "wandb" \
--gradient_checkpointing True \
--deepspeed ds_config.json \
--bf16 True \
--run_name "Name for logs" \
--debug True \
--alpha "Enter value for desired alpha parameter for SeleKT" \
```
Update the above command as needed for your model. A rough sketch of the selective update implied by `--alpha` is shown below.
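The `--alpha` and `--save_steps` (periodicity M) arguments drive SeleKT's periodic selective update; `selekt.py` and the paper are the authoritative references. Purely as an illustrative assumption, the selection step can be pictured as keeping only the largest-magnitude alpha fraction of weight deltas relative to the base model and resetting the rest:
```python
import torch

def selekt_update(model, base_state_dict, alpha=0.05):
    """Illustrative sketch only (not the repository's implementation):
    keep the top-alpha fraction of weight deltas w.r.t. the base model,
    reset the remaining parameters back to their base values."""
    with torch.no_grad():
        for name, param in model.named_parameters():
            base = base_state_dict[name].to(device=param.device, dtype=param.dtype)
            delta = param.data - base
            k = max(1, int(alpha * delta.numel()))
            # Threshold at the k-th largest |delta|; smaller changes revert to the base weights.
            threshold = delta.abs().flatten().kthvalue(delta.numel() - k + 1).values
            mask = (delta.abs() >= threshold).to(delta.dtype)
            param.data.copy_(base + mask * delta)
```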
- To train on conversational data while applying the loss only to the response, uncomment lines 291, 292, and 301 and run the same command with the path to a suitable conversational dataset:
```python
response_template = "#RESPONSE\n"
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)

# Initialize trainer
trainer = SFTTrainer(
model=model,
processing_class=tokenizer,
train_dataset=dataset,
args=training_config,
callbacks=[Callback(flush_steps=1)],
data_collator=collator, # pass the collator in the trainer
)
```
- Set `zero_optimization.stage` to `3` and `overlap_comm` to `false` in `ds_config.json` for better memory optimization
- Add the respective variables such as `MODEL_PATH`, `TRAIN_DATA`, and `OUTPUT_DIR` in the `run.sh` script and run
```bash
bash ./SeleKT/run.sh
```
112 changes: 112 additions & 0 deletions src/train/SeleKT/run.sh
@@ -0,0 +1,112 @@
#!/bin/bash

export MODEL_NAME=""
export DESC=""

# Stage 1: Instruction Training
OUTPUT_DIR_STAGE1="./output/selekt_stage1_instruction"
TRAIN_DATA_STAGE1=""
MODEL_PATH=""

# Stage 2: Conversational Training
OUTPUT_DIR_STAGE2="./output/selekt_stage2_conversational"
TRAIN_DATA_STAGE2=""

# Return the newest checkpoint-* directory (version-sorted) under the given output dir
find_latest_checkpoint() {
local output_dir=$1
local latest_checkpoint=$(find "$output_dir" -name "checkpoint-*" -type d | sort -V | tail -1)
echo "$latest_checkpoint"
}

echo "Starting Stage 1: SeleKT Instruction Training..."
echo "Model: $MODEL_PATH"
echo "Training data: $TRAIN_DATA_STAGE1"
echo "Output directory: $OUTPUT_DIR_STAGE1"

mkdir -p $OUTPUT_DIR_STAGE1

# Stage 1: Instruction Training
accelerate launch \
--config_file=../configs/general_acc.yaml \
selekt.py \
--model_name_or_path "$MODEL_PATH" \
--train_data_path "$TRAIN_DATA_STAGE1" \
--output_dir ${OUTPUT_DIR_STAGE1} \
--num_train_epochs 3 \
--model_max_length 16384 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 4 \
--save_strategy "epoch" \
--save_steps 760 \
--save_total_limit 25 \
--learning_rate 1e-5 \
--warmup_ratio 0.1 \
--weight_decay 0.1 \
--logging_steps 5 \
--lr_scheduler_type "cosine" \
--report_to "wandb" \
--gradient_checkpointing True \
--deepspeed ../configs/ds_config.json \
--bf16 True \
--run_name "${MODEL_NAME}_stage1_instruction" \
--alpha 0.05 \

if [ $? -ne 0 ]; then
echo "Error: Stage 1 training failed!"
exit 1
fi

echo "Stage 1 completed successfully!"

LATEST_CHECKPOINT=$(find_latest_checkpoint "$OUTPUT_DIR_STAGE1")

if [ -z "$LATEST_CHECKPOINT" ]; then
echo "Error: No checkpoint found in $OUTPUT_DIR_STAGE1"
exit 1
fi

echo "Found latest checkpoint: $LATEST_CHECKPOINT"
echo "Starting Stage 2: SeleKT Conversational Training..."
echo "Model: $LATEST_CHECKPOINT"
echo "Training data: $TRAIN_DATA_STAGE2"
echo "Output directory: $OUTPUT_DIR_STAGE2"

mkdir -p $OUTPUT_DIR_STAGE2

# Stage 2: Conversational Training
accelerate launch \
--config_file=../configs/general_acc.yaml \
selekt.py \
--model_name_or_path "${LATEST_CHECKPOINT}" \
--train_data_path "$TRAIN_DATA_STAGE2" \
--output_dir ${OUTPUT_DIR_STAGE2} \
--num_train_epochs 3 \
--model_max_length 16384 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 4 \
--save_strategy "epoch" \
--save_steps 760 \
--save_total_limit 25 \
--learning_rate 1e-5 \
--warmup_ratio 0.1 \
--weight_decay 0.1 \
--logging_steps 5 \
--lr_scheduler_type "cosine" \
--report_to "wandb" \
--gradient_checkpointing True \
--deepspeed ../configs/ds_config.json \
--bf16 True \
--run_name "${MODEL_NAME}_stage2_conversational" \
--alpha 0.05 \
--is_conversational_training \


# Check if stage 2 completed successfully
if [ $? -ne 0 ]; then
echo "Error: Stage 2 training failed!"
exit 1
fi

echo "Stage 2 training completed!"
echo "Both training stages completed successfully!"
echo "Final model saved in: $OUTPUT_DIR_STAGE2"
13 changes: 8 additions & 5 deletions src/train/selekt.py → src/train/SeleKT/selekt.py
@@ -70,10 +70,11 @@ def parse_args():
help="Whether to use bf16 mixed precision training")
parser.add_argument("--run_name", type=str, default=None)
parser.add_argument("--use_liger", type=bool, default=False)
parser.add_argument("--debug", type=bool, default=False)
parser.add_argument("--packing", type=bool, default=True,
help="Whether to use packing for training")
parser.add_argument("--alpha", type=float, default=0.05,)
parser.add_argument("--alpha", type=float, default=0.05, help="Alpha value for SeleKT")
parser.add_argument("--is_conversational_training", action='store_true',
help="Whether to use conversational training format")

args, _ = parser.parse_known_args()
return args
@@ -300,8 +301,10 @@ def train(args):
print(f'Resuming from checkpoint: {last_checkpoint}')


# response_template = "#RESPONSE\n"
# collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
collator = None
if args.is_conversational_training:
response_template = "#RESPONSE\n"
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)

callback = Callback(base_model_path=args.base_model_path, flush_steps=1, alpha=args.alpha)
trainer = SFTTrainer(
@@ -310,7 +313,7 @@
train_dataset=dataset,
args=training_config,
callbacks=[callback],
# data_collator=collator,
data_collator=collator,
)
callback.set_trainer(trainer)
print(f"Starting training for epoch {args.num_train_epochs}")
File renamed without changes.
File renamed without changes.
12 changes: 10 additions & 2 deletions src/train/lora.py → src/train/lora/lora.py
@@ -66,9 +66,10 @@ def parse_args():
help="Whether to use bf16 mixed precision training")
parser.add_argument("--run_name", type=str, default=None)
parser.add_argument("--use_liger", type=bool, default=False)
parser.add_argument("--debug", type=bool, default=False)
parser.add_argument("--packing", type=bool, default=True,
help="Whether to use packing for training")
parser.add_argument("--is_conversational_training", action='store_true',
help="Whether to use conversational training format")

args, _ = parser.parse_known_args()
return args
@@ -151,12 +152,13 @@ def main():
output_dir=args.output_dir,
report_to="none",
gradient_checkpointing=args.gradient_checkpointing,
gradient_checkpointing_kwargs={"use_reentrant": False},
gradient_checkpointing_kwargs={"use_reentrant": True},
deepspeed=args.deepspeed,
dataset_num_proc=80,
run_name=args.run_name,
use_liger=args.use_liger,
)

lora_config = LoraConfig(
r=64,
# target_modules= ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],
@@ -175,13 +177,19 @@

dataset = setup_training_data(args, local_rank, tokenizer)

collator = None
if args.is_conversational_training:
response_template = "#RESPONSE\n"
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)

trainer = SFTTrainer(
model=model,
processing_class=tokenizer,
train_dataset=dataset,
args=training_config,
peft_config=lora_config,
callbacks=[Callback(flush_steps=1)],
data_collator=collator
)

print("Starting LoRA training...")