diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index ea612bf381..0be74081ca 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -15,11 +15,10 @@ from tests.common import TUNE_PATH from tests.recipes.utils import ( llama3_2_vision_test_config, - llama3_test_config, - write_hf_ckpt_config, + MODEL_TEST_CONFIGS, write_hf_vision_ckpt_config, ) -from tests.test_utils import CKPT_MODEL_PATHS, gpu_test +from tests.test_utils import CKPT_MODEL_PATHS, gpu_test, TOKENIZER_PATHS class TestEleutherEval: @@ -48,20 +47,19 @@ def expected_vision_acc(self): } @pytest.mark.parametrize( - "eval_name, expected_acc, bsz", + "model_ckpt, eval_name, expected_acc, bsz", [ - ("truthfulqa_gen", 0.1818, 4), - ("truthfulqa_mc2", 0.3015, 4), + ("llama3_hf_138m", "truthfulqa_gen", 0.1818, 4), + ("llama3_hf_138m", "truthfulqa_mc2", 0.3015, 4), ], ) @pytest.mark.integration_test @gpu_test(gpu_count=1) def test_torchtune_checkpoint_eval_results( - self, caplog, monkeypatch, tmpdir, eval_name, expected_acc, bsz + self, caplog, monkeypatch, tmpdir, eval_name, expected_acc, bsz, model_ckpt ): - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) # explicitly setting limit to an odd number here to ensure generation tasks # work with KV-cacheing + bsz > 1 - we'll receive batches of size 4, 4, 3 @@ -69,13 +67,10 @@ def test_torchtune_checkpoint_eval_results( tune run eleuther_eval \ --config eleuther_evaluation \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - tokenizer._component_=torchtune.models.llama3.llama3_tokenizer \ - 
tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}'\ tokenizer.prompt_template=null \ limit=11 \ dtype=fp32 \ @@ -83,7 +78,7 @@ def test_torchtune_checkpoint_eval_results( batch_size={bsz} \ """.split() - model_config = llama3_test_config() + model_config = MODEL_TEST_CONFIGS[model_ckpt] cmd = cmd + model_config monkeypatch.setattr(sys, "argv", cmd) @@ -108,28 +103,30 @@ def test_torchtune_checkpoint_eval_results( @pytest.mark.integration_test @pytest.mark.usefixtures("hide_correct_version_number") @gpu_test(gpu_count=1) - def test_eval_recipe_errors_without_lm_eval(self, monkeypatch, tmpdir): - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) + def test_eval_recipe_errors_without_lm_eval(self, monkeypatch, tmpdir, model_ckpt): + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) cmd = f""" tune run eleuther_eval \ --config eleuther_evaluation \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - tokenizer._component_=torchtune.models.llama3.llama3_tokenizer \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ limit=1 \ dtype=fp32 \ """.split() - model_config = llama3_test_config() + model_config = MODEL_TEST_CONFIGS[model_ckpt] cmd = cmd + model_config monkeypatch.setattr(sys, "argv", cmd) @@ -142,27 +139,26 @@ def test_eval_recipe_errors_without_lm_eval(self, monkeypatch, tmpdir): @pytest.mark.integration_test @gpu_test(gpu_count=1) + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) def 
test_eval_recipe_errors_with_quantization_hf_checkpointer( - self, monkeypatch, tmpdir + self, monkeypatch, tmpdir, model_ckpt ): - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent - - # Config file needed for model conversion. - write_hf_ckpt_config(ckpt_dir) + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) cmd = f""" tune run eleuther_eval \ --config eleuther_evaluation \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelHFCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - tokenizer._component_=torchtune.models.llama3.llama3_tokenizer \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ limit=1 \ dtype=fp32 \ @@ -170,7 +166,7 @@ def test_eval_recipe_errors_with_quantization_hf_checkpointer( quantizer.groupsize=256 \ """.split() - model_config = llama3_test_config() + model_config = MODEL_TEST_CONFIGS[model_ckpt] cmd = cmd + model_config monkeypatch.setattr(sys, "argv", cmd) @@ -183,22 +179,26 @@ def test_eval_recipe_errors_with_quantization_hf_checkpointer( @pytest.mark.integration_test @gpu_test(gpu_count=1) - def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir): - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) + def test_eval_recipe_errors_with_qat_quantizer( + self, monkeypatch, tmpdir, model_ckpt + ): + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) cmd = f""" tune run eleuther_eval \ --config eleuther_evaluation \ output_dir={tmpdir} \ - 
checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - tokenizer._component_=torchtune.models.llama3.llama3_tokenizer \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ limit=1 \ dtype=fp32 \ @@ -206,7 +206,7 @@ def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir): quantizer.groupsize=32\ """.split() - model_config = llama3_test_config() + model_config = MODEL_TEST_CONFIGS[model_ckpt] cmd = cmd + model_config monkeypatch.setattr(sys, "argv", cmd) @@ -223,6 +223,9 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) ckpt_dir = ckpt_path.parent + # Config file needed for model conversion. + write_hf_vision_ckpt_config(ckpt_dir) + cmd = f""" tune run eleuther_eval \ --config llama3_2_vision/11B_evaluation \ diff --git a/tests/recipes/test_full_dpo_distributed.py b/tests/recipes/test_full_dpo_distributed.py index f20533c934..4534695d05 100644 --- a/tests/recipes/test_full_dpo_distributed.py +++ b/tests/recipes/test_full_dpo_distributed.py @@ -11,11 +11,7 @@ import pytest import torch from tests.common import TUNE_PATH -from tests.recipes.utils import ( - dummy_stack_exchange_dataset_config, - MODEL_TEST_CONFIGS, - write_hf_ckpt_config, -) +from tests.recipes.utils import dummy_stack_exchange_dataset_config, MODEL_TEST_CONFIGS from tests.test_utils import ( CKPT_MODEL_PATHS, gen_log_file_name, @@ -48,8 +44,14 @@ def _get_test_config_overrides(self, dtype_str: str = "fp32", epochs: int = 2): ] + dummy_stack_exchange_dataset_config() @pytest.mark.integration_test + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) @gpu_test(gpu_count=2) - def 
test_training_state_on_resume(self, tmpdir, monkeypatch): + def test_training_state_on_resume(self, tmpdir, monkeypatch, model_ckpt): """Test whether the recipe state is correctly updated on resume. Since this is model agnostic, we should run this on the small model only. The test consists of three stages: @@ -58,37 +60,26 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): - Make sure final loss matches the expected value of a model successfully resumed from a ckpt """ - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - tokenizer_path = Path(TOKENIZER_PATHS["llama3"]) - - # Config file needed for model conversion. - # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) # Train for two epochs cmd_1 = f""" tune run --nnodes 1 --nproc_per_node 2 full_dpo_distributed \ --config llama3_1/8B_full_dpo \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - ref_checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ ref_checkpointer.checkpoint_dir='{ckpt_dir}' \ - ref_checkpointer.checkpoint_files=[{ckpt_path}]\ + ref_checkpointer.checkpoint_files=[model.safetensors]\ ref_checkpointer.output_dir={tmpdir} \ - ref_checkpointer.model_type=LLAMA3 \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ metric_logger.filename={log_file} \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3"] + model_config = MODEL_TEST_CONFIGS[model_ckpt] cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd_1) 
runpy.run_path(TUNE_PATH, run_name="__main__") @@ -100,7 +91,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): ) # We rename the model and we want to resume from epoch 0 (which trained for 1 epoch) - ckpt_to_resume_from = "epoch_0/model-00001-of-00001.bin" + ckpt_to_resume_from = "epoch_0/model-00001-of-00001.safetensors" # Now we resume training from epoch 1 resumed_log_dir = (tmpdir / "resumed/").mkdir() @@ -109,16 +100,12 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): tune run --nnodes 1 --nproc_per_node 2 full_dpo_distributed \ --config llama3_1/8B_full_dpo \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ checkpointer.checkpoint_files=[{ckpt_to_resume_from}]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - ref_checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ ref_checkpointer.checkpoint_dir='{ckpt_dir}' \ - ref_checkpointer.checkpoint_files=[{ckpt_path}]\ + ref_checkpointer.checkpoint_files=[model.safetensors]\ ref_checkpointer.output_dir={tmpdir} \ - ref_checkpointer.model_type=LLAMA3 \ resume_from_checkpoint=True \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ @@ -135,44 +122,39 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): ) @pytest.mark.integration_test + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) @gpu_test(gpu_count=2) def test_training_state_on_resume_with_async_checkpointing( - self, tmpdir, monkeypatch + self, tmpdir, monkeypatch, model_ckpt ): """Same as above test but with async checkpointing.""" - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - tokenizer_path = Path(TOKENIZER_PATHS["llama3"]) - - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) # Train for two epochs cmd_1 = f""" tune run --nnodes 1 --nproc_per_node 2 full_dpo_distributed \ --config llama3_1/8B_full_dpo \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - ref_checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ ref_checkpointer.checkpoint_dir='{ckpt_dir}' \ - ref_checkpointer.checkpoint_files=[{ckpt_path}]\ + ref_checkpointer.checkpoint_files=[model.safetensors]\ ref_checkpointer.output_dir={tmpdir} \ - ref_checkpointer.model_type=LLAMA3 \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ metric_logger.filename={log_file} \ enable_async_checkpointing=True \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3"] + model_config = MODEL_TEST_CONFIGS[model_ckpt] cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd_1) @@ -191,16 +173,12 @@ def test_training_state_on_resume_with_async_checkpointing( tune run --nnodes 1 --nproc_per_node 2 full_dpo_distributed \ --config llama3_1/8B_full_dpo \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - ref_checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ ref_checkpointer.checkpoint_dir='{ckpt_dir}' \ - ref_checkpointer.checkpoint_files=[{ckpt_path}]\ + ref_checkpointer.checkpoint_files=[model.safetensors]\ ref_checkpointer.output_dir={tmpdir} \ - 
ref_checkpointer.model_type=LLAMA3 \ resume_from_checkpoint=True \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ diff --git a/tests/recipes/test_knowledge_distillation_distributed.py b/tests/recipes/test_knowledge_distillation_distributed.py index c3fd00ca3d..e037ba6370 100644 --- a/tests/recipes/test_knowledge_distillation_distributed.py +++ b/tests/recipes/test_knowledge_distillation_distributed.py @@ -13,12 +13,7 @@ import torch from omegaconf import OmegaConf from tests.common import TUNE_PATH -from tests.recipes.utils import ( - CKPT_COMPONENT_MAP, - dummy_alpaca_dataset_config, - MODEL_TEST_CONFIGS, - write_hf_ckpt_config, -) +from tests.recipes.utils import dummy_alpaca_dataset_config, MODEL_TEST_CONFIGS from tests.test_utils import ( CKPT_MODEL_PATHS, gen_log_file_name, @@ -53,46 +48,49 @@ def _get_test_config_overrides(self, epochs: int = 2): "compile=False", ] + dummy_alpaca_dataset_config() - def _fetch_expected_loss_values(self, model_type): + def _fetch_expected_loss_values(self, model_ckpt): loss_values_map = { - "llama3": [ - 11.777642250061035, - 11.760451793670654, - 11.755887508392334, - 11.76237678527832, + "llama3_hf_138m": [ + # TODO + # 11.777642250061035, + # 11.760451793670654, + # 11.755887508392334, + # 11.76237678527832, ], } - return loss_values_map[model_type] + return loss_values_map[model_ckpt] @pytest.mark.integration_test @gpu_test(gpu_count=4) - def test_loss(self, tmpdir, monkeypatch): - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) + def test_loss(self, tmpdir, monkeypatch, model_ckpt): + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - tokenizer_path = Path(TOKENIZER_PATHS["llama3"]) cmd = f""" tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \ --config 
llama3_2/8B_to_1B_KD_lora_distributed \ output_dir={tmpdir} \ - checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}] \ + checkpointer.checkpoint_files=[model.safetensors] \ checkpointer.output_dir={tmpdir} \ - teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ metric_logger.filename={log_file} \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] teacher_config = [ - "teacher_" + config for config in MODEL_TEST_CONFIGS["llama3"] + "teacher_" + config for config in MODEL_TEST_CONFIGS[model_ckpt] ] cmd = cmd + self._get_test_config_overrides() + model_config + teacher_config @@ -103,15 +101,20 @@ def test_loss(self, tmpdir, monkeypatch): # only take the first loss num_losses = int(len(loss_values) / 4) # 2 steps per epoch, 2 epochs loss_values = loss_values[0::num_losses] - expected_loss_values = self._fetch_expected_loss_values("llama3") - + expected_loss_values = self._fetch_expected_loss_values(model_ckpt) torch.testing.assert_close( loss_values, expected_loss_values, rtol=1e-5, atol=1e-5 ) @pytest.mark.integration_test @gpu_test(gpu_count=4) - def test_training_state_on_resume(self, tmpdir, monkeypatch): + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) + def test_training_state_on_resume(self, tmpdir, monkeypatch, model_ckpt): """Test whether the recipe state is correctly updated on resume. Since this is model agnostic, we should run this on the small model only. 
The test consists of three stages: @@ -120,37 +123,28 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): - Make sure final loss matches the expected value of a model successfully resumed from a ckpt """ - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - tokenizer_path = Path(TOKENIZER_PATHS["llama3"]) - - # Config file needed for model conversion. - # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) # Train for two epochs cmd_1 = f""" tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \ --config llama3_2/8B_to_1B_KD_lora_distributed \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ tokenizer.path={tokenizer_path} \ tokenizer.prompt_template=null \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] teacher_config = [ - "teacher_" + config for config in MODEL_TEST_CONFIGS["llama3"] + "teacher_" + config for config in MODEL_TEST_CONFIGS[model_ckpt] ] cmd_1 = ( @@ -166,15 +160,13 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \ --config llama3_2/8B_to_1B_KD_lora_distributed \ output_dir={tmpdir} \ - 
checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ - checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ - checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} - checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")} + checkpointer.checkpoint_dir='{ckpt_dir}' \ + checkpointer.checkpoint_files=[model.safetensors]\ + checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} \ + checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")} \ checkpointer.output_dir={tmpdir} \ - teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ resume_from_checkpoint=True \ metric_logger.filename={log_file} \ @@ -191,7 +183,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): runpy.run_path(TUNE_PATH, run_name="__main__") # Second epoch only - expected_loss_values = self._fetch_expected_loss_values("llama3")[2:] + expected_loss_values = self._fetch_expected_loss_values(model_ckpt)[2:] loss_values = get_loss_values_from_metric_logger(log_file) # only take the first loss num_losses = int(len(loss_values) / 4) # 2 steps per epoch, 2 epochs @@ -203,8 +195,14 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): @pytest.mark.integration_test @gpu_test(gpu_count=4) + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) def test_training_state_on_resume_with_async_checkpointing( - self, tmpdir, monkeypatch + self, tmpdir, monkeypatch, model_ckpt ): """Test whether the recipe state is correctly updated on resume with async checkpointing. Since this is model agnostic, we should run this on the small model only. 
The test @@ -214,38 +212,29 @@ def test_training_state_on_resume_with_async_checkpointing( - Make sure final loss matches the expected value of a model successfully resumed from a ckpt """ - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - tokenizer_path = Path(TOKENIZER_PATHS["llama3"]) - - # Config file needed for model conversion. - # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) # Train for two epochs cmd_1 = f""" tune run --nnodes 1 --nproc_per_node 4 knowledge_distillation_distributed \ --config llama3_2/8B_to_1B_KD_lora_distributed \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ enable_async_checkpointing=True \ tokenizer.path={tokenizer_path} \ tokenizer.prompt_template=null \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] teacher_config = [ - "teacher_" + config for config in MODEL_TEST_CONFIGS["llama3"] + "teacher_" + config for config in MODEL_TEST_CONFIGS[model_ckpt] ] cmd_1 = ( @@ -254,18 +243,15 @@ def test_training_state_on_resume_with_async_checkpointing( monkeypatch.setattr(sys, "argv", cmd_1) runpy.run_path(TUNE_PATH, run_name="__main__") - # Resume training cmd_2 = f""" tune run --nnodes 1 --nproc_per_node 4 
knowledge_distillation_distributed \ --config llama3_2/8B_to_1B_KD_lora_distributed \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ resume_from_checkpoint=True \ enable_async_checkpointing=True \ @@ -283,7 +269,7 @@ def test_training_state_on_resume_with_async_checkpointing( runpy.run_path(TUNE_PATH, run_name="__main__") # Second epoch only - expected_loss_values = self._fetch_expected_loss_values("llama3")[2:] + expected_loss_values = self._fetch_expected_loss_values(model_ckpt)[2:] loss_values = get_loss_values_from_metric_logger(log_file) # only take the first loss num_losses = int(len(loss_values) / 4) # 2 steps per epoch, 2 epochs @@ -295,36 +281,35 @@ def test_training_state_on_resume_with_async_checkpointing( @pytest.mark.integration_test @gpu_test(gpu_count=4) - def test_save_and_load_merged_weights(self, tmpdir, monkeypatch): - ckpt_type = "tune" - model_type = "llama3" - ckpt_component = CKPT_COMPONENT_MAP[ckpt_type] - ckpt = model_type + "_" + ckpt_type - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) + def test_save_and_load_merged_weights(self, tmpdir, monkeypatch, model_ckpt): + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) cmd = f""" tune run --nnodes 1 --nproc_per_node 4 
knowledge_distillation_distributed \ --config llama3_2/8B_to_1B_KD_lora_distributed \ output_dir={tmpdir} \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}] \ + checkpointer.checkpoint_files=[model.safetensors] \ checkpointer.output_dir={tmpdir} \ - teacher_checkpointer._component_={ckpt_component} \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ metric_logger.filename={log_file} \ """.split() - model_config = MODEL_TEST_CONFIGS[model_type + "_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] teacher_config = [ - "teacher_" + config for config in MODEL_TEST_CONFIGS[model_type] + "teacher_" + config for config in MODEL_TEST_CONFIGS[model_ckpt] ] cmd = cmd + self._get_test_config_overrides() + model_config + teacher_config @@ -340,7 +325,7 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch): lora_model = config.instantiate(OmegaConf.from_dotlist(model_config).model) # Build base llama3 model for loading merged weights - base_llama3_config = MODEL_TEST_CONFIGS[model_type] + base_llama3_config = MODEL_TEST_CONFIGS[model_ckpt] llama3_model = config.instantiate( OmegaConf.from_dotlist(base_llama3_config).model ) @@ -350,16 +335,18 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch): adpt_path = os.path.join(tmpdir, epoch_folder, f"{ADAPTER_MODEL_FNAME}.pt") lora_sd = safe_torch_load(adpt_path, weights_only=True) - with open(ckpt_path, "rb") as f: - base_model_sd = torch.load(f, weights_only=True) + # Load base model from HF checkpoint + base_model_path = os.path.join(ckpt_dir, "model.safetensors") + base_model_sd = safe_torch_load(base_model_path, weights_only=True) + lora_model.load_state_dict(lora_sd, 
strict=False) lora_model.load_state_dict(base_model_sd, strict=False) baseline_out = lora_model(inputs) - # Load merged final ckpt directly into 3 and call fwd - suffix = ".safetensors" if ckpt_type == "hf" else ".bin" + # Load merged final ckpt directly into llama3 and call fwd model_ckpt_fname = ( - SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)) + suffix + SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)) + + ".safetensors" ) model_path = os.path.join(tmpdir, epoch_folder, model_ckpt_fname) sd = safe_torch_load(model_path, weights_only=True) diff --git a/tests/recipes/test_knowledge_distillation_single_device.py b/tests/recipes/test_knowledge_distillation_single_device.py index b95aea8306..1ad35bd610 100644 --- a/tests/recipes/test_knowledge_distillation_single_device.py +++ b/tests/recipes/test_knowledge_distillation_single_device.py @@ -13,12 +13,7 @@ import torch from omegaconf import OmegaConf from tests.common import TUNE_PATH -from tests.recipes.utils import ( - CKPT_COMPONENT_MAP, - dummy_alpaca_dataset_config, - MODEL_TEST_CONFIGS, - write_hf_ckpt_config, -) +from tests.recipes.utils import dummy_alpaca_dataset_config, MODEL_TEST_CONFIGS from tests.test_utils import ( CKPT_MODEL_PATHS, gen_log_file_name, @@ -54,17 +49,21 @@ def _get_test_config_overrides(self, dtype_str: str = "fp32", epochs: int = 2): "clip_grad_norm=100", ] + dummy_alpaca_dataset_config() - def _fetch_expected_loss_values(self, model_type): + def _fetch_expected_loss_values(self, model_ckpt): loss_values_map = { - "llama3": [11.7898, 11.7825, 11.7788, 11.7671], + # "llama3_hf_138m": [11.7898, 11.7825, 11.7788, 11.7671], } - return loss_values_map[model_type] + return loss_values_map[model_ckpt] @pytest.mark.integration_test @pytest.mark.parametrize( "micro_batch_size, gradient_accumulation_steps, compile", [(8, 1, False), (2, 4, True), (2, 4, False)], ) + @pytest.mark.parametrize( + "model_ckpt", + [("llama3_hf_138m")], + ) @gpu_test(gpu_count=1) def 
test_loss( self, @@ -73,15 +72,11 @@ def test_loss( compile, tmpdir, monkeypatch, + model_ckpt, ): config = "qwen2/1.5_to_0.5B_KD_lora_single_device" - model_type = "llama3" - ckpt_type = "tune" - ckpt_component = CKPT_COMPONENT_MAP[ckpt_type] - ckpt = model_type + "_" + ckpt_type - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) cmd = f""" @@ -90,28 +85,22 @@ def test_loss( output_dir={tmpdir} \ batch_size={micro_batch_size} \ gradient_accumulation_steps={gradient_accumulation_steps} \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}] \ + checkpointer.checkpoint_files=[model.safetensors] \ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ - teacher_checkpointer._component_={ckpt_component} \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ - teacher_checkpointer.model_type={model_type.upper()} \ - tokenizer._component_=torchtune.models.llama3.llama3_tokenizer \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ ~tokenizer.merges_file \ - metric_logger._component_=torchtune.training.metric_logging.DiskLogger \ metric_logger.filename={log_file} \ compile={compile} \ """.split() - model_config = MODEL_TEST_CONFIGS[model_type + "_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] teacher_config = [ - "teacher_" + config for config in MODEL_TEST_CONFIGS[model_type] + "teacher_" + config for config in MODEL_TEST_CONFIGS[model_ckpt] ] cmd = ( @@ -132,14 +121,20 @@ def test_loss( # only take the first loss num_losses = int(len(loss_values) / 4) # 2 steps 
per epoch, 2 epochs loss_values = loss_values[0::num_losses] - expected_loss_values = self._fetch_expected_loss_values(model_type) + expected_loss_values = self._fetch_expected_loss_values(model_ckpt) torch.testing.assert_close( loss_values, expected_loss_values, rtol=1e-5, atol=1e-5 ) @pytest.mark.integration_test @gpu_test(gpu_count=1) - def test_training_state_on_resume(self, tmpdir, monkeypatch): + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) + def test_training_state_on_resume(self, tmpdir, monkeypatch, model_ckpt): """Test whether the recipe state is correctly updated on resume. Since this is model agnostic, we should run this on the small model only. The test consists of three stages: @@ -148,42 +143,29 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): - Make sure final loss matches the expected value of a model successfully resumed from a ckpt """ - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - tokenizer_path = Path(TOKENIZER_PATHS["llama3"]) - - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) # Train for two epochs cmd_1 = f""" tune run knowledge_distillation_single_device \ --config qwen2/1.5_to_0.5B_KD_lora_single_device \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ - teacher_checkpointer.model_type=LLAMA3 \ - tokenizer._component_=torchtune.models.llama3.llama3_tokenizer \ tokenizer.path={tokenizer_path} \ tokenizer.prompt_template=null \ ~tokenizer.merges_file \ - metric_logger._component_=torchtune.training.metric_logging.DiskLogger \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] teacher_config = [ - "teacher_" + config for config in MODEL_TEST_CONFIGS["llama3"] + "teacher_" + config for config in MODEL_TEST_CONFIGS[model_ckpt] ] cmd_1 = ( @@ -200,22 +182,16 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): tune run knowledge_distillation_single_device \ --config qwen2/1.5_to_0.5B_KD_lora_single_device \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ - checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} - checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, 
"recipe_state.pt")} + checkpointer.checkpoint_files=[model.safetensors]\ + checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")}\ + checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")}\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ - teacher_checkpointer.model_type=LLAMA3 \ resume_from_checkpoint=True \ - metric_logger._component_=torchtune.training.metric_logging.DiskLogger \ metric_logger.filename={log_file} \ - tokenizer._component_=torchtune.models.llama3.llama3_tokenizer \ tokenizer.path={tokenizer_path} \ tokenizer.prompt_template=null \ ~tokenizer.merges_file \ @@ -231,7 +207,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): runpy.run_path(TUNE_PATH, run_name="__main__") # Second epoch only - expected_loss_values = self._fetch_expected_loss_values("llama3")[2:] + expected_loss_values = self._fetch_expected_loss_values(model_ckpt)[2:] loss_values = get_loss_values_from_metric_logger(log_file) # only take the first loss num_losses = int(len(loss_values) / 4) # 2 steps per epoch, 2 epochs @@ -243,8 +219,14 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): @pytest.mark.integration_test @gpu_test(gpu_count=1) + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) def test_training_state_on_resume_with_async_checkpointing( - self, tmpdir, monkeypatch + self, tmpdir, monkeypatch, model_ckpt ): """Test whether the recipe state is correctly updated on resume. Since this is model agnostic, we should run this on the small model only. 
The test @@ -254,42 +236,29 @@ def test_training_state_on_resume_with_async_checkpointing( - Make sure final loss matches the expected value of a model successfully resumed from a ckpt """ - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - tokenizer_path = Path(TOKENIZER_PATHS["llama3"]) - - # Config file needed for model conversion. - # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) # Train for two epochs cmd_1 = f""" tune run knowledge_distillation_single_device \ --config qwen2/1.5_to_0.5B_KD_lora_single_device \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ - teacher_checkpointer.model_type=LLAMA3 \ - tokenizer._component_=torchtune.models.llama3.llama3_tokenizer \ tokenizer.path={tokenizer_path} \ tokenizer.prompt_template=null \ ~tokenizer.merges_file \ - metric_logger._component_=torchtune.training.metric_logging.DiskLogger \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] teacher_config = [ - "teacher_" + config for config in MODEL_TEST_CONFIGS["llama3"] + "teacher_" + config for config in MODEL_TEST_CONFIGS[model_ckpt] ] cmd_1 = ( @@ -306,22 +275,16 @@ def 
test_training_state_on_resume_with_async_checkpointing( tune run knowledge_distillation_single_device \ --config qwen2/1.5_to_0.5B_KD_lora_single_device \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ - checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} - checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")} + checkpointer.checkpoint_files=[model.safetensors]\ + checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")}\ + checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")}\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - teacher_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ - teacher_checkpointer.model_type=LLAMA3 \ resume_from_checkpoint=True \ - metric_logger._component_=torchtune.training.metric_logging.DiskLogger \ metric_logger.filename={log_file} \ - tokenizer._component_=torchtune.models.llama3.llama3_tokenizer \ tokenizer.path={tokenizer_path} \ tokenizer.prompt_template=null \ ~tokenizer.merges_file \ @@ -337,7 +300,7 @@ def test_training_state_on_resume_with_async_checkpointing( runpy.run_path(TUNE_PATH, run_name="__main__") # Second epoch only - expected_loss_values = self._fetch_expected_loss_values("llama3")[2:] + expected_loss_values = self._fetch_expected_loss_values(model_ckpt)[2:] loss_values = get_loss_values_from_metric_logger(log_file) # only take the first loss num_losses = int(len(loss_values) / 4) # 2 steps per epoch, 2 epochs @@ -348,41 +311,37 @@ def 
test_training_state_on_resume_with_async_checkpointing( ) @pytest.mark.integration_test - def test_save_and_load_merged_weights(self, tmpdir, monkeypatch): - ckpt_type = "tune" - model_type = "llama3" - ckpt_component = CKPT_COMPONENT_MAP[ckpt_type] - ckpt = model_type + "_" + ckpt_type - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) + @gpu_test(gpu_count=1) + def test_save_and_load_merged_weights(self, tmpdir, monkeypatch, model_ckpt): + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) cmd = f""" tune run knowledge_distillation_single_device \ --config qwen2/1.5_to_0.5B_KD_lora_single_device \ output_dir={tmpdir} \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}] \ + checkpointer.checkpoint_files=[model.safetensors] \ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ - teacher_checkpointer._component_={ckpt_component} \ teacher_checkpointer.checkpoint_dir='{ckpt_dir}' \ - teacher_checkpointer.checkpoint_files=[{ckpt_path}] \ + teacher_checkpointer.checkpoint_files=[model.safetensors] \ teacher_checkpointer.output_dir={tmpdir} \ - teacher_checkpointer.model_type={model_type.upper()} \ - tokenizer._component_=torchtune.models.llama3.llama3_tokenizer \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ ~tokenizer.merges_file \ - metric_logger._component_=torchtune.training.metric_logging.DiskLogger \ metric_logger.filename={log_file} \ """.split() - model_config = MODEL_TEST_CONFIGS[model_type + "_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] teacher_config = [ - "teacher_" + config for config in MODEL_TEST_CONFIGS[model_type] + "teacher_" + config for config in 
MODEL_TEST_CONFIGS[model_ckpt] ] cmd = ( @@ -404,7 +363,7 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch): lora_model = config.instantiate(OmegaConf.from_dotlist(model_config).model) # Build base llama3 model for loading merged weights - base_llama3_config = MODEL_TEST_CONFIGS[model_type] + base_llama3_config = MODEL_TEST_CONFIGS[model_ckpt] llama3_model = config.instantiate( OmegaConf.from_dotlist(base_llama3_config).model ) @@ -414,16 +373,18 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch): adpt_path = os.path.join(tmpdir, epoch_folder, f"{ADAPTER_MODEL_FNAME}.pt") lora_sd = safe_torch_load(adpt_path, weights_only=True) - with open(ckpt_path, "rb") as f: - base_model_sd = torch.load(f, weights_only=True) + # Load base model from HF checkpoint + base_model_path = os.path.join(ckpt_dir, "model.safetensors") + base_model_sd = safe_torch_load(base_model_path, weights_only=True) + lora_model.load_state_dict(lora_sd, strict=False) lora_model.load_state_dict(base_model_sd, strict=False) baseline_out = lora_model(inputs) # Load merged final ckpt directly into 3 and call fwd - suffix = ".safetensors" if ckpt_type == "hf" else ".bin" model_ckpt_fname = ( - SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)) + suffix + SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)) + + ".safetensors" ) model_path = os.path.join(tmpdir, epoch_folder, model_ckpt_fname) sd = safe_torch_load(model_path, weights_only=True) diff --git a/tests/recipes/test_lora_dpo_distributed.py b/tests/recipes/test_lora_dpo_distributed.py index abc1cddc07..ae361cbdbb 100644 --- a/tests/recipes/test_lora_dpo_distributed.py +++ b/tests/recipes/test_lora_dpo_distributed.py @@ -13,11 +13,7 @@ import torch from omegaconf import OmegaConf from tests.common import TUNE_PATH -from tests.recipes.utils import ( - dummy_stack_exchange_dataset_config, - MODEL_TEST_CONFIGS, - write_hf_ckpt_config, -) +from tests.recipes.utils import 
dummy_stack_exchange_dataset_config, MODEL_TEST_CONFIGS from tests.test_utils import ( CKPT_MODEL_PATHS, gen_log_file_name, diff --git a/tests/recipes/test_lora_dpo_single_device.py b/tests/recipes/test_lora_dpo_single_device.py index e7766d6330..a4bae256cd 100644 --- a/tests/recipes/test_lora_dpo_single_device.py +++ b/tests/recipes/test_lora_dpo_single_device.py @@ -13,16 +13,13 @@ import torch from omegaconf import OmegaConf from tests.common import TUNE_PATH -from tests.recipes.utils import ( - dummy_stack_exchange_dataset_config, - MODEL_TEST_CONFIGS, - write_hf_ckpt_config, -) +from tests.recipes.utils import dummy_stack_exchange_dataset_config, MODEL_TEST_CONFIGS from tests.test_utils import ( CKPT_MODEL_PATHS, gen_log_file_name, get_loss_values_from_metric_logger, gpu_test, + TOKENIZER_PATHS, ) from torchtune import config @@ -54,8 +51,14 @@ def _get_test_config_overrides(self, dtype_str: str = "fp32", epochs: int = 2): @pytest.mark.parametrize("save_adapter_weights_only", [False, True]) @pytest.mark.integration_test @gpu_test(gpu_count=1) + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) def test_training_state_on_resume( - self, tmpdir, monkeypatch, save_adapter_weights_only + self, tmpdir, monkeypatch, save_adapter_weights_only, model_ckpt ): """Test whether the recipe state is correctly updated on resume. Since this is model agnostic, we should run this on the small model only. The test @@ -66,16 +69,10 @@ def test_training_state_on_resume( Unlike `tests.recipes.test_lora_finetune_single_device`, this test does not use pre-computed loss values to benchmark against. This test just ensures the loss values are identical when resuming. """ - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) - # Train for two epochs cmd_1 = f""" tune run lora_dpo_single_device \ @@ -83,12 +80,10 @@ def test_training_state_on_resume( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ save_adapter_weights_only={save_adapter_weights_only} \ metric_logger.filename={log_file} \ @@ -96,7 +91,7 @@ def test_training_state_on_resume( enable_activation_offloading=False \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd_1) @@ -117,16 +112,13 @@ def test_training_state_on_resume( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.adapter_checkpoint={os.path.join(tmpdir, epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} checkpointer.recipe_checkpoint={os.path.join(tmpdir, RECIPE_STATE_DIRNAME, "recipe_state.pt")} checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ resume_from_checkpoint=True \ metric_logger.filename={resumed_log_file} \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ tokenizer.prompt_template=null \ 
enable_activation_checkpointing=True \ enable_activation_offloading=False \ @@ -145,8 +137,14 @@ def test_training_state_on_resume( @pytest.mark.parametrize("save_adapter_weights_only", [False, True]) @pytest.mark.integration_test + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) def test_training_state_on_resume_with_async_checkpointing( - self, tmpdir, monkeypatch, save_adapter_weights_only + self, tmpdir, monkeypatch, save_adapter_weights_only, model_ckpt ): """Test whether the recipe state is correctly updated on resume. Since this is model agnostic, we should run this on the small model only. The test @@ -157,16 +155,10 @@ def test_training_state_on_resume_with_async_checkpointing( Unlike `tests.recipes.test_lora_finetune_single_device`, this test does not use pre-computed loss values to benchmark against. This test just ensures the loss values are identical when resuming. """ - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) - # Train for two epochs cmd_1 = f""" tune run lora_dpo_single_device \ @@ -174,12 +166,10 @@ def test_training_state_on_resume_with_async_checkpointing( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ save_adapter_weights_only={save_adapter_weights_only} \ metric_logger.filename={log_file} \ @@ -188,7 +178,7 @@ def test_training_state_on_resume_with_async_checkpointing( enable_async_checkpointing=True \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd_1) @@ -209,16 +199,14 @@ def test_training_state_on_resume_with_async_checkpointing( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.adapter_checkpoint={os.path.join(tmpdir, epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} checkpointer.recipe_checkpoint={os.path.join(tmpdir, RECIPE_STATE_DIRNAME, "recipe_state.pt")} checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ resume_from_checkpoint=True \ metric_logger.filename={resumed_log_file} \ - 
tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ enable_activation_checkpointing=True \ enable_activation_offloading=False \ @@ -238,10 +226,15 @@ def test_training_state_on_resume_with_async_checkpointing( @pytest.mark.integration_test @gpu_test(gpu_count=1) - def test_save_and_load_merged_weights(self, tmpdir, monkeypatch): - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) + def test_save_and_load_merged_weights(self, tmpdir, monkeypatch, model_ckpt): + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) cmd = f""" tune run lora_dpo_single_device \ @@ -249,18 +242,16 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch): output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ enable_activation_checkpointing=False \ enable_activation_offloading=False \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd = cmd + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd) @@ -276,7 +267,7 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch): lora_model = config.instantiate(OmegaConf.from_dotlist(model_config).model) # Build base llama3 model for loading merged weights - base_llama3_config = MODEL_TEST_CONFIGS["llama3"] + base_llama3_config = 
MODEL_TEST_CONFIGS[model_ckpt] llama3_model = config.instantiate( OmegaConf.from_dotlist(base_llama3_config).model ) @@ -286,14 +277,16 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch): adpt_path = os.path.join(tmpdir, epoch_folder, f"{ADAPTER_MODEL_FNAME}.pt") lora_sd = safe_torch_load(adpt_path, weights_only=True) - with open(ckpt_path, "rb") as f: - base_model_sd = torch.load(f, weights_only=True) + # Load base model from HF checkpoint + base_model_path = os.path.join(ckpt_dir, "model.safetensors") + base_model_sd = safe_torch_load(base_model_path, weights_only=True) + lora_model.load_state_dict(lora_sd, strict=False) lora_model.load_state_dict(base_model_sd, strict=False) baseline_out = lora_model(inputs) # Load merged final ckpt directly into llama3 and call fwd - suffix = ".bin" + suffix = ".safetensors" model_ckpt_fname = ( SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)) + suffix ) diff --git a/tests/recipes/test_lora_finetune_distributed.py b/tests/recipes/test_lora_finetune_distributed.py index 7e82d31c9c..6b7411300b 100644 --- a/tests/recipes/test_lora_finetune_distributed.py +++ b/tests/recipes/test_lora_finetune_distributed.py @@ -13,12 +13,7 @@ import torch from omegaconf import OmegaConf from tests.common import TUNE_PATH -from tests.recipes.utils import ( - CKPT_COMPONENT_MAP, - dummy_alpaca_dataset_config, - MODEL_TEST_CONFIGS, - write_hf_ckpt_config, -) +from tests.recipes.utils import dummy_alpaca_dataset_config, MODEL_TEST_CONFIGS from tests.test_utils import ( CKPT_MODEL_PATHS, gen_log_file_name, @@ -50,19 +45,20 @@ def _get_test_config_overrides(self): "compile=False", ] + dummy_alpaca_dataset_config() - def _fetch_expected_loss_values(self, model_type): + def _fetch_expected_loss_values(self, model_ckpt): # These values have been validated against single device recipe test via # https://gist.github.com/ebsmothers/f1c3db7c66655a23a91e0290360960c4 + # TODO loss_values_map = { - "llama3": [11.9839, 11.9691, 
11.9617, 11.9383], + # "llama3": [11.9839, 11.9691, 11.9617, 11.9383], } - return loss_values_map[model_type] + return loss_values_map[model_ckpt] @pytest.mark.integration_test @gpu_test(gpu_count=2) @pytest.mark.parametrize( - "micro_batch_size, gradient_accumulation_steps, reshard_after_forward", - [(4, 1, True), (1, 4, False)], + "model_ckpt, micro_batch_size, gradient_accumulation_steps, reshard_after_forward", + [("llama3_hf_138m", 4, 1, True), ("llama3_hf_138m", 1, 4, False)], ) def test_loss( self, @@ -71,11 +67,12 @@ def test_loss( reshard_after_forward, tmpdir, monkeypatch, + model_ckpt, ): - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) + cmd = f""" tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora \ @@ -84,26 +81,24 @@ def test_loss( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ metric_logger.filename={log_file} \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ reshard_after_forward={reshard_after_forward} \ enable_activation_checkpointing=False \ enable_activation_offloading=False \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd = cmd + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd) runpy.run_path(TUNE_PATH, run_name="__main__") loss_values = get_loss_values_from_metric_logger(log_file) - 
expected_loss_values = self._fetch_expected_loss_values("llama3") + expected_loss_values = self._fetch_expected_loss_values(model_ckpt) torch.testing.assert_close( loss_values, expected_loss_values, rtol=1e-5, atol=1e-5 ) @@ -111,19 +106,13 @@ def test_loss( @pytest.mark.integration_test @gpu_test(gpu_count=2) @pytest.mark.parametrize( - "config, model_type, ckpt_type, save_adapter_weights_only", + "config, model_ckpt, save_adapter_weights_only", [ - ("llama3/8B_lora", "llama3", "tune", False), + ("llama3/8B_lora", "llama3_hf_138m", False), ], ) def test_training_state_on_resume( - self, - config, - model_type, - ckpt_type, - tmpdir, - monkeypatch, - save_adapter_weights_only, + self, config, tmpdir, monkeypatch, save_adapter_weights_only, model_ckpt ): """Test whether the recipe state is correctly updated on resume. Since this is model agnostic, we should run this on the small model only. The test @@ -132,19 +121,11 @@ def test_training_state_on_resume( - Resume training after epoch 1 - Make sure final loss matches the expected value of a model successfully resumed from a ckpt """ - ckpt_component = CKPT_COMPONENT_MAP[ckpt_type] - ckpt = model_type + "_" + ckpt_type - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) - # Train for two epochs cmd_1 = f""" tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed \ @@ -154,11 +135,9 @@ def test_training_state_on_resume( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ save_adapter_weights_only={save_adapter_weights_only} \ @@ -166,7 +145,7 @@ def test_training_state_on_resume( enable_activation_offloading=True \ """.split() - model_config = MODEL_TEST_CONFIGS[model_type + "_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd_1) @@ -183,13 +162,11 @@ def test_training_state_on_resume( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ - checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} - checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")} + checkpointer.checkpoint_files=[model.safetensors]\ + checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")}\ + checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")}\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ 
resume_from_checkpoint=True \ @@ -202,7 +179,7 @@ def test_training_state_on_resume( monkeypatch.setattr(sys, "argv", cmd_2) runpy.run_path(TUNE_PATH, run_name="__main__") - expected_loss_values = self._fetch_expected_loss_values(model_type)[2:] + expected_loss_values = self._fetch_expected_loss_values(model_ckpt)[2:] loss_values = get_loss_values_from_metric_logger(log_file) torch.testing.assert_close( @@ -212,19 +189,13 @@ def test_training_state_on_resume( @pytest.mark.integration_test @gpu_test(gpu_count=2) @pytest.mark.parametrize( - "config, model_type, ckpt_type, save_adapter_weights_only", + "config, model_ckpt, save_adapter_weights_only", [ - ("llama3/8B_lora", "llama3", "tune", False), + ("llama3/8B_lora", "llama3_hf_138m", False), ], ) def test_training_state_on_resume_with_async_checkpointing( - self, - config, - model_type, - ckpt_type, - tmpdir, - monkeypatch, - save_adapter_weights_only, + self, config, tmpdir, monkeypatch, save_adapter_weights_only, model_ckpt ): """Test whether the recipe state is correctly updated on resume. Since this is model agnostic, we should run this on the small model only. The test @@ -233,19 +204,10 @@ def test_training_state_on_resume_with_async_checkpointing( - Resume training after epoch 1 - Make sure final loss matches the expected value of a model successfully resumed from a ckpt """ - ckpt_component = CKPT_COMPONENT_MAP[ckpt_type] - ckpt = model_type + "_" + ckpt_type - - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) - # Train for two epochs cmd_1 = f""" tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed \ @@ -255,11 +217,9 @@ def test_training_state_on_resume_with_async_checkpointing( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ save_adapter_weights_only={save_adapter_weights_only} \ @@ -268,7 +228,7 @@ def test_training_state_on_resume_with_async_checkpointing( enable_async_checkpointing=True \ """.split() - model_config = MODEL_TEST_CONFIGS[model_type + "_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd_1) @@ -283,11 +243,9 @@ def test_training_state_on_resume_with_async_checkpointing( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ resume_from_checkpoint=True \ @@ -301,7 +259,7 @@ def test_training_state_on_resume_with_async_checkpointing( monkeypatch.setattr(sys, "argv", cmd_2) runpy.run_path(TUNE_PATH, run_name="__main__") - expected_loss_values = self._fetch_expected_loss_values(model_type)[2:] + expected_loss_values = 
self._fetch_expected_loss_values(model_ckpt)[2:] loss_values = get_loss_values_from_metric_logger(log_file) torch.testing.assert_close( @@ -310,20 +268,17 @@ def test_training_state_on_resume_with_async_checkpointing( @pytest.mark.integration_test @pytest.mark.parametrize( - "recipe_config, model_type, ckpt_type, use_dora", + "recipe_config, use_dora, model_ckpt", [ - ("llama3/8B_lora", "llama3", "tune", False), + ("llama3/8B_lora", False, "llama3_hf_138m"), ], ) @gpu_test(gpu_count=2) def test_save_and_load_merged_weights( - self, recipe_config, model_type, ckpt_type, use_dora, tmpdir, monkeypatch + self, recipe_config, use_dora, tmpdir, monkeypatch, model_ckpt ): - ckpt_component = CKPT_COMPONENT_MAP[ckpt_type] - ckpt = model_type + "_" + ckpt_type - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) cmd = f""" tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed \ --config {recipe_config} \ @@ -332,19 +287,16 @@ def test_save_and_load_merged_weights( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - model=torchtune.models.lora_small_test_model \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ enable_activation_checkpointing=True \ enable_activation_offloading=True \ """.split() model_config = MODEL_TEST_CONFIGS[ - model_type + ("_dora" if use_dora else "_lora") + model_ckpt + ("_dora" if use_dora else "_lora") ] cmd = cmd + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd) @@ -359,7 +311,7 @@ def 
test_save_and_load_merged_weights( lora_model = config.instantiate(OmegaConf.from_dotlist(model_config).model) # Build base model for loading merged weights - base_config = MODEL_TEST_CONFIGS[model_type] + base_config = MODEL_TEST_CONFIGS[model_ckpt] model = config.instantiate(OmegaConf.from_dotlist(base_config).model) # Load base model and trained adapter weights into LoRA model and call fwd @@ -367,15 +319,16 @@ def test_save_and_load_merged_weights( adpt_path = os.path.join(tmpdir, epoch_folder, f"{ADAPTER_MODEL_FNAME}.pt") lora_sd = safe_torch_load(adpt_path, weights_only=True) - with open(ckpt_path, "rb") as f: - base_model_sd = torch.load(f, weights_only=True) + # Load base model from HF checkpoint + base_model_path = os.path.join(ckpt_dir, "model.safetensors") + base_model_sd = safe_torch_load(base_model_path, weights_only=True) lora_model.load_state_dict(lora_sd, strict=False) lora_model.load_state_dict(base_model_sd, strict=False) baseline_out = lora_model(inputs) # Load merged final ckpt directly into model and call fwd - suffix = ".safetensors" if ckpt_type == "hf" else ".bin" + suffix = ".safetensors" model_ckpt_fname = ( SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)) + suffix ) diff --git a/tests/recipes/test_lora_finetune_single_device.py b/tests/recipes/test_lora_finetune_single_device.py index a42c298466..de9d675555 100644 --- a/tests/recipes/test_lora_finetune_single_device.py +++ b/tests/recipes/test_lora_finetune_single_device.py @@ -13,12 +13,7 @@ import torch from omegaconf import OmegaConf from tests.common import TUNE_PATH -from tests.recipes.utils import ( - CKPT_COMPONENT_MAP, - dummy_alpaca_dataset_config, - MODEL_TEST_CONFIGS, - write_hf_ckpt_config, -) +from tests.recipes.utils import dummy_alpaca_dataset_config, MODEL_TEST_CONFIGS from tests.test_utils import ( CKPT_MODEL_PATHS, gen_log_file_name, @@ -50,23 +45,25 @@ def _get_test_config_overrides(self, dtype_str: str = "fp32", epochs: int = 2): "clip_grad_norm=100", 
] + dummy_alpaca_dataset_config() - def _fetch_expected_loss_values(self, model_type): + def _fetch_expected_loss_values(self, model_ckpt): + # TODO loss_values_map = { - "llama3": [11.9838, 11.9691, 11.9616, 11.9383], + # "llama3": [11.9838, 11.9691, 11.9616, 11.9383], } - return loss_values_map[model_type] + return loss_values_map[model_ckpt] def _fetch_qlora_expected_loss_values(self, dtype): + # TODO if dtype == "bf16": return [11.9857, 11.9711, 11.9619, 11.9407] return [11.9857, 11.9712, 11.9613, 11.9408] @pytest.mark.integration_test @pytest.mark.parametrize( - "config, model_type, ckpt_type, micro_batch_size, gradient_accumulation_steps, compile", + "config, model_ckpt, micro_batch_size, gradient_accumulation_steps, compile", [ - ("llama3/8B_lora_single_device", "llama3", "tune", 2, 4, True), - ("llama3/8B_lora_single_device", "llama3", "tune", 2, 4, False), + ("llama3/8B_lora_single_device", "llama3_hf_138m", 2, 4, True), + ("llama3/8B_lora_single_device", "llama3_hf_138m", 2, 4, False), ], ) @gpu_test(gpu_count=1) @@ -76,16 +73,12 @@ def test_loss( micro_batch_size, gradient_accumulation_steps, config, - model_type, - ckpt_type, + model_ckpt, tmpdir, monkeypatch, ): - ckpt_component = CKPT_COMPONENT_MAP[ckpt_type] - ckpt = model_type + "_" + ckpt_type - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) cmd = f""" @@ -96,18 +89,16 @@ def test_loss( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj'] \ model.apply_lora_to_mlp=False \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}] \ + checkpointer.checkpoint_files=[model.safetensors] \ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' 
\ tokenizer.prompt_template=null \ metric_logger.filename={log_file} \ compile={compile} \ """.split() - model_config = MODEL_TEST_CONFIGS[model_type + "_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd = cmd + self._get_test_config_overrides(dtype_str="fp32") + model_config monkeypatch.setattr(sys, "argv", cmd) @@ -119,19 +110,19 @@ def test_loss( torch._dynamo.reset() loss_values = get_loss_values_from_metric_logger(log_file) - expected_loss_values = self._fetch_expected_loss_values(model_type) + expected_loss_values = self._fetch_expected_loss_values(model_ckpt) torch.testing.assert_close( loss_values, expected_loss_values, rtol=1e-5, atol=1e-5 ) @pytest.mark.integration_test @pytest.mark.parametrize( - "dtype, compile, micro_batch_size, gradient_accumulation_steps", + "model_ckpt, dtype, compile, micro_batch_size, gradient_accumulation_steps", [ - ("fp32", True, 8, 1), - ("bf16", True, 2, 4), - ("fp32", False, 4, 2), - ("bf16", False, 8, 1), + ("llama3_hf_138m", "fp32", True, 8, 1), + ("llama3_hf_138m", "bf16", True, 2, 4), + ("llama3_hf_138m", "fp32", False, 4, 2), + ("llama3_hf_138m", "bf16", False, 8, 1), ], ) @gpu_test(gpu_count=1) @@ -143,10 +134,10 @@ def test_loss_qlora( gradient_accumulation_steps, tmpdir, monkeypatch, + model_ckpt, ): - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) cmd = f""" @@ -157,20 +148,18 @@ def test_loss_qlora( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj','k_proj','output_proj'] \ model.apply_lora_to_mlp=True \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ 
metric_logger.filename={log_file} \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ compile={compile} \ enable_activation_checkpointing=False \ enable_activation_offloading=False \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_qlora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_qlora"] cmd = cmd + self._get_test_config_overrides(dtype_str=dtype) + model_config monkeypatch.setattr(sys, "argv", cmd) @@ -190,8 +179,14 @@ def test_loss_qlora( @pytest.mark.parametrize("save_adapter_weights_only", [False, True]) @pytest.mark.integration_test @gpu_test(gpu_count=1) + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) def test_training_state_on_resume( - self, tmpdir, monkeypatch, save_adapter_weights_only + self, tmpdir, monkeypatch, save_adapter_weights_only, model_ckpt ): """Test whether the recipe state is correctly updated on resume. Since this is model agnostic, we should run this on the small model only. The test @@ -201,16 +196,10 @@ def test_training_state_on_resume( - Make sure final loss matches the expected value of a model successfully resumed from a ckpt """ - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) - # Train for two epochs cmd_1 = f""" tune run lora_finetune_single_device \ @@ -220,19 +209,17 @@ def test_training_state_on_resume( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj','k_proj','output_proj'] \ model.apply_lora_to_mlp=True \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ save_adapter_weights_only={save_adapter_weights_only} \ enable_activation_checkpointing=True \ enable_activation_offloading=False \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd_1) @@ -250,16 +237,14 @@ def test_training_state_on_resume( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj','k_proj','output_proj'] \ model.apply_lora_to_mlp=True \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ - checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} - checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")} + checkpointer.checkpoint_files=[model.safetensors]\ + checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} \ + checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")} \ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ 
resume_from_checkpoint=True \ metric_logger.filename={log_file} \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ enable_activation_checkpointing=True \ enable_activation_offloading=False \ @@ -270,7 +255,7 @@ def test_training_state_on_resume( runpy.run_path(TUNE_PATH, run_name="__main__") # Second epoch only - expected_loss_values = self._fetch_expected_loss_values("llama3")[2:] + expected_loss_values = self._fetch_expected_loss_values(model_ckpt)[2:] loss_values = get_loss_values_from_metric_logger(log_file)[:2] torch.testing.assert_close( @@ -279,8 +264,15 @@ def test_training_state_on_resume( @pytest.mark.parametrize("save_adapter_weights_only", [False, True]) @pytest.mark.integration_test + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) + @gpu_test(gpu_count=1) def test_training_state_on_resume_with_async_checkpointing( - self, tmpdir, monkeypatch, save_adapter_weights_only + self, tmpdir, monkeypatch, save_adapter_weights_only, model_ckpt ): """Test whether the recipe state is correctly updated on resume. Since this is model agnostic, we should run this on the small model only. The test @@ -290,16 +282,10 @@ def test_training_state_on_resume_with_async_checkpointing( - Make sure final loss matches the expected value of a model successfully resumed from a ckpt """ - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) - # Train for two epochs cmd_1 = f""" tune run lora_finetune_single_device \ @@ -309,12 +295,10 @@ def test_training_state_on_resume_with_async_checkpointing( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj','k_proj','output_proj'] \ model.apply_lora_to_mlp=True \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ save_adapter_weights_only={save_adapter_weights_only} \ enable_activation_checkpointing=True \ @@ -322,7 +306,7 @@ def test_training_state_on_resume_with_async_checkpointing( enable_async_checkpointing=True \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd_1) @@ -338,14 +322,12 @@ def test_training_state_on_resume_with_async_checkpointing( output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj','k_proj','output_proj'] \ model.apply_lora_to_mlp=True \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ resume_from_checkpoint=True \ metric_logger.filename={log_file} \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ enable_activation_checkpointing=True \ enable_activation_offloading=False \ @@ 
-357,7 +339,7 @@ def test_training_state_on_resume_with_async_checkpointing( runpy.run_path(TUNE_PATH, run_name="__main__") # Second epoch only - expected_loss_values = self._fetch_expected_loss_values("llama3")[2:] + expected_loss_values = self._fetch_expected_loss_values(model_ckpt)[2:] loss_values = get_loss_values_from_metric_logger(log_file)[:2] torch.testing.assert_close( @@ -367,10 +349,17 @@ def test_training_state_on_resume_with_async_checkpointing( @pytest.mark.parametrize("use_dora", [False, True]) @pytest.mark.integration_test @gpu_test(gpu_count=1) - def test_save_and_load_merged_weights(self, tmpdir, monkeypatch, use_dora): - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - ckpt_dir = ckpt_path.parent + @pytest.mark.parametrize( + "model_ckpt", + [ + ("llama3_hf_138m"), + ], + ) + def test_save_and_load_merged_weights( + self, tmpdir, monkeypatch, use_dora, model_ckpt + ): + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) cmd = f""" tune run lora_finetune_single_device \ @@ -378,21 +367,19 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch, use_dora): output_dir={tmpdir} \ model.lora_attn_modules=['q_proj','v_proj','k_proj','output_proj'] \ model.apply_lora_to_mlp=True \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ - tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ enable_activation_checkpointing=True \ enable_activation_offloading=False \ """.split() if use_dora: - model_config = MODEL_TEST_CONFIGS["llama3_dora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_dora"] else: - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = 
MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd = cmd + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd) @@ -408,7 +395,7 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch, use_dora): lora_model = config.instantiate(OmegaConf.from_dotlist(model_config).model) # Build base llama3 model for loading merged weights - base_llama3_config = MODEL_TEST_CONFIGS["llama3"] + base_llama3_config = MODEL_TEST_CONFIGS[model_ckpt] llama3_model = config.instantiate( OmegaConf.from_dotlist(base_llama3_config).model ) @@ -418,15 +405,18 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch, use_dora): adpt_path = os.path.join(tmpdir, epoch_folder, f"{ADAPTER_MODEL_FNAME}.pt") lora_sd = safe_torch_load(adpt_path, weights_only=True) - with open(ckpt_path, "rb") as f: - base_model_sd = torch.load(f, weights_only=True) + # Load base model from HF checkpoint + base_model_path = os.path.join(ckpt_dir, "model.safetensors") + base_model_sd = safe_torch_load(base_model_path, weights_only=True) + lora_model.load_state_dict(lora_sd, strict=False) lora_model.load_state_dict(base_model_sd, strict=False) baseline_out = lora_model(inputs) # Load merged final ckpt directly into llama3 and call fwd model_ckpt_fname = ( - SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)) + ".bin" + SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)) + + ".safetensors" ) model_path = os.path.join(tmpdir, epoch_folder, model_ckpt_fname) sd = safe_torch_load(model_path, weights_only=True) diff --git a/tests/recipes/test_ppo_full_finetune_single_device.py b/tests/recipes/test_ppo_full_finetune_single_device.py index d2afae4e4f..7ed1c3db75 100644 --- a/tests/recipes/test_ppo_full_finetune_single_device.py +++ b/tests/recipes/test_ppo_full_finetune_single_device.py @@ -16,7 +16,6 @@ from tests.recipes.utils import ( dummy_text_completion_alpaca_dataset_config, MODEL_TEST_CONFIGS, - write_llama3_hf_ckpt_config, ) from 
tests.test_utils import ( CKPT_MODEL_PATHS, @@ -47,7 +46,6 @@ def _get_test_config_overrides(self): "enable_activation_checkpointing=False", "enable_activation_offloading=False", f"tokenizer.path={TOKENIZER_PATHS['llama3']}", - "tokenizer._component_=torchtune.models.llama3.llama3_tokenizer", "tokenizer.prompt_template=null", "tokenizer.max_seq_len=64", "seed=9", @@ -82,44 +80,41 @@ def _get_expected_loss_values(self): or torch.cuda.get_device_capability() not in ((8, 6)), reason="Unexpected device type", ) + @pytest.mark.parametrize( + "model_ckpt", + [("llama3_hf_138m")], + ) @gpu_test(gpu_count=1) - def test_loss(self, tmpdir, monkeypatch): + def test_loss(self, tmpdir, monkeypatch, model_ckpt): reward_ckpt_path = Path(CKPT_MODEL_PATHS["llama3_reward_hf"]) - policy_ckpt_path = Path(CKPT_MODEL_PATHS["llama3_tune"]) + policy_ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) - ckpt_dir = policy_ckpt_path.parent log_file = gen_log_file_name(tmpdir) policy_tmpdir = (tmpdir / "policy").mkdir() value_tmpdir = (tmpdir / "value").mkdir() - write_llama3_hf_ckpt_config(ckpt_dir) cmd_1 = f""" tune run ppo_full_finetune_single_device \ --config mistral/7B_full_ppo_low_memory \ output_dir={tmpdir} \ - checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ - checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{policy_ckpt_path}]\ + checkpointer.checkpoint_dir='{policy_ckpt_dir}' \ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={policy_tmpdir} \ - checkpointer.model_type=LLAMA3 \ - ref_policy_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ - ref_policy_checkpointer.checkpoint_dir='{ckpt_dir}' \ - ref_policy_checkpointer.checkpoint_files=[{policy_ckpt_path}]\ - ref_policy_checkpointer.model_type=LLAMA3 \ + ref_policy_checkpointer.checkpoint_dir='{policy_ckpt_dir}' \ + ref_policy_checkpointer.checkpoint_files=[model.safetensors]\ - value_checkpointer.checkpoint_dir='{ckpt_dir}' \ 
- value_checkpointer.checkpoint_files=[{reward_ckpt_path}]\ + value_checkpointer.checkpoint_dir='{reward_ckpt_path.parent}' \ + value_checkpointer.checkpoint_files=[{reward_ckpt_path.name}]\ value_checkpointer.output_dir={value_tmpdir} \ - reward_checkpointer.checkpoint_dir='{ckpt_dir}' \ - reward_checkpointer.checkpoint_files=[{reward_ckpt_path}]\ + reward_checkpointer.checkpoint_dir='{reward_ckpt_path.parent}' \ + reward_checkpointer.checkpoint_files=[{reward_ckpt_path.name}]\ - metric_logger._component_=torchtune.training.metric_logging.DiskLogger \ metric_logger.filename={log_file} \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3"] + model_config = MODEL_TEST_CONFIGS[model_ckpt] model_config = [k.replace("model.", "policy_model.") for k in model_config] reward_and_value_model_config = MODEL_TEST_CONFIGS["llama3_classifier"] @@ -146,24 +141,21 @@ def test_loss(self, tmpdir, monkeypatch): ) @pytest.mark.integration_test + @pytest.mark.parametrize( + "model_ckpt", + [("llama3_hf_138m")], + ) @gpu_test(gpu_count=1) - def test_training_state_on_resume(self, tmpdir, monkeypatch): + def test_training_state_on_resume(self, tmpdir, monkeypatch, model_ckpt): """Test whether the recipe state correctly saved and restored after training.""" reward_ckpt_path = Path(CKPT_MODEL_PATHS["llama3_reward_hf"]) - policy_ckpt_path = Path(CKPT_MODEL_PATHS["llama3_tune"]) + policy_ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) - ckpt_dir = policy_ckpt_path.parent log_file = gen_log_file_name(tmpdir) policy_tmpdir = (tmpdir / "policy").mkdir() value_tmpdir = (tmpdir / "value").mkdir() - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_llama3_hf_ckpt_config(ckpt_dir) - write_llama3_hf_ckpt_config(policy_tmpdir) - write_llama3_hf_ckpt_config(value_tmpdir) - # There are 4 steps in total (num_steps / batch size) # and the dataset has 8 samples, so each epoch will be 2 batches # a single step is a single batch update, and we checkpoint at every epoch (2 steps) @@ -173,29 +165,24 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): tune run ppo_full_finetune_single_device \ --config mistral/7B_full_ppo_low_memory \ output_dir={tmpdir} \ - checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ - checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{policy_ckpt_path}]\ + checkpointer.checkpoint_dir='{policy_ckpt_dir}' \ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={policy_tmpdir} \ - checkpointer.model_type=LLAMA3 \ - ref_policy_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ - ref_policy_checkpointer.checkpoint_dir='{ckpt_dir}' \ - ref_policy_checkpointer.checkpoint_files=[{policy_ckpt_path}]\ - ref_policy_checkpointer.model_type=LLAMA3 \ + ref_policy_checkpointer.checkpoint_dir='{policy_ckpt_dir}' \ + ref_policy_checkpointer.checkpoint_files=[model.safetensors]\ - value_checkpointer.checkpoint_dir='{ckpt_dir}' \ - value_checkpointer.checkpoint_files=[{reward_ckpt_path}]\ + value_checkpointer.checkpoint_dir='{reward_ckpt_path.parent}' \ + value_checkpointer.checkpoint_files=[{reward_ckpt_path.name}]\ value_checkpointer.output_dir={value_tmpdir} \ - reward_checkpointer.checkpoint_dir='{ckpt_dir}' \ - reward_checkpointer.checkpoint_files=[{reward_ckpt_path}]\ + reward_checkpointer.checkpoint_dir='{reward_ckpt_path.parent}' \ + reward_checkpointer.checkpoint_files=[{reward_ckpt_path.name}]\ - metric_logger._component_=torchtune.training.metric_logging.DiskLogger \ metric_logger.filename={log_file} \ """.split() - model_config = 
MODEL_TEST_CONFIGS["llama3"] + model_config = MODEL_TEST_CONFIGS[model_ckpt] model_config = [k.replace("model.", "policy_model.") for k in model_config] reward_and_value_model_config = MODEL_TEST_CONFIGS["llama3_classifier"] @@ -222,7 +209,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): epoch_folder = get_largest_iter_folder(value_tmpdir) epoch_folder_minus_one = f"epoch_{int(epoch_folder.split('_')[-1]) - 1}" - policy_suffix = ".bin" + policy_suffix = ".safetensors" value_suffix = ".safetensors" policy_model_ckpt_fname = ( SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)) @@ -236,27 +223,22 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): tune run ppo_full_finetune_single_device \ --config mistral/7B_full_ppo_low_memory \ output_dir={tmpdir} \ - checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ - checkpointer.checkpoint_dir='{ckpt_dir}' \ + checkpointer.checkpoint_dir='{policy_tmpdir}' \ checkpointer.checkpoint_files=[{os.path.join(epoch_folder_minus_one, policy_model_ckpt_fname)}]\ checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")}\ checkpointer.output_dir={policy_tmpdir} \ - checkpointer.model_type=LLAMA3 \ - ref_policy_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ - ref_policy_checkpointer.checkpoint_dir='{ckpt_dir}' \ - ref_policy_checkpointer.checkpoint_files=[{policy_ckpt_path}]\ - ref_policy_checkpointer.model_type=LLAMA3 \ + ref_policy_checkpointer.checkpoint_dir='{policy_ckpt_dir}' \ + ref_policy_checkpointer.checkpoint_files=[model.safetensors]\ - value_checkpointer.checkpoint_dir='{ckpt_dir}' \ - value_checkpointer.checkpoint_files=[{os.path.join(value_tmpdir, epoch_folder_minus_one, value_model_ckpt_fname)}]\ + value_checkpointer.checkpoint_dir='{value_tmpdir}' \ + value_checkpointer.checkpoint_files=[{os.path.join(epoch_folder_minus_one, value_model_ckpt_fname)}]\ value_checkpointer.output_dir={value_tmpdir} 
\ - reward_checkpointer.checkpoint_dir='{ckpt_dir}' \ - reward_checkpointer.checkpoint_files=[{reward_ckpt_path}]\ + reward_checkpointer.checkpoint_dir='{reward_ckpt_path.parent}' \ + reward_checkpointer.checkpoint_files=[{reward_ckpt_path.name}]\ resume_from_checkpoint=True \ - metric_logger._component_=torchtune.training.metric_logging.DiskLogger \ metric_logger.filename={resumed_log_file} \ """.split() @@ -279,8 +261,14 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): ) @pytest.mark.integration_test + @pytest.mark.parametrize( + "model_ckpt", + [("llama3_hf_138m")], + ) @gpu_test(gpu_count=1) - def test_training_state_on_resume_with_optimizer_in_bwd(self, tmpdir, monkeypatch): + def test_training_state_on_resume_with_optimizer_in_bwd( + self, tmpdir, monkeypatch, model_ckpt + ): """Test whether the recipe state correctly saves and restores optimizer state when using ``optimizer_in_bwd``, since the optimizer checkpoint dict will include parameters for two models. @@ -289,47 +277,36 @@ def test_training_state_on_resume_with_optimizer_in_bwd(self, tmpdir, monkeypatc """ reward_ckpt_path = Path(CKPT_MODEL_PATHS["llama3_reward_hf"]) - policy_ckpt_path = Path(CKPT_MODEL_PATHS["llama3_tune"]) + policy_ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) - ckpt_dir = policy_ckpt_path.parent log_file = gen_log_file_name(tmpdir) policy_tmpdir = (tmpdir / "policy").mkdir() value_tmpdir = (tmpdir / "value").mkdir() - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_llama3_hf_ckpt_config(ckpt_dir) - write_llama3_hf_ckpt_config(policy_tmpdir) - write_llama3_hf_ckpt_config(value_tmpdir) cmd_1 = f""" tune run ppo_full_finetune_single_device \ --config mistral/7B_full_ppo_low_memory \ output_dir={tmpdir} \ - checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ - checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{policy_ckpt_path}]\ + checkpointer.checkpoint_dir='{policy_ckpt_dir}' \ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={policy_tmpdir} \ - checkpointer.model_type=LLAMA3 \ - ref_policy_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ - ref_policy_checkpointer.checkpoint_dir='{ckpt_dir}' \ - ref_policy_checkpointer.checkpoint_files=[{policy_ckpt_path}]\ - ref_policy_checkpointer.model_type=LLAMA3 \ + ref_policy_checkpointer.checkpoint_dir='{policy_ckpt_dir}' \ + ref_policy_checkpointer.checkpoint_files=[model.safetensors]\ - value_checkpointer.checkpoint_dir='{ckpt_dir}' \ - value_checkpointer.checkpoint_files=[{reward_ckpt_path}]\ + value_checkpointer.checkpoint_dir='{reward_ckpt_path.parent}' \ + value_checkpointer.checkpoint_files=[{reward_ckpt_path.name}]\ value_checkpointer.output_dir={value_tmpdir} \ - reward_checkpointer.checkpoint_dir='{ckpt_dir}' \ - reward_checkpointer.checkpoint_files=[{reward_ckpt_path}]\ + reward_checkpointer.checkpoint_dir='{reward_ckpt_path.parent}' \ + reward_checkpointer.checkpoint_files=[{reward_ckpt_path.name}]\ - metric_logger._component_=torchtune.training.metric_logging.DiskLogger \ metric_logger.filename={log_file} \ optimizer_in_bwd=True """.split() - model_config = MODEL_TEST_CONFIGS["llama3"] + model_config = MODEL_TEST_CONFIGS[model_ckpt] model_config = [k.replace("model.", "policy_model.") for k in model_config] reward_and_value_model_config = MODEL_TEST_CONFIGS["llama3_classifier"] @@ -357,7 +334,7 @@ def 
test_training_state_on_resume_with_optimizer_in_bwd(self, tmpdir, monkeypatc epoch_folder = get_largest_iter_folder(value_tmpdir) epoch_folder_minus_one = f"epoch_{int(epoch_folder.split('_')[-1]) - 1}" - policy_suffix = ".bin" + policy_suffix = ".safetensors" value_suffix = ".safetensors" policy_model_ckpt_fname = ( SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)) @@ -371,27 +348,22 @@ def test_training_state_on_resume_with_optimizer_in_bwd(self, tmpdir, monkeypatc tune run ppo_full_finetune_single_device \ --config mistral/7B_full_ppo_low_memory \ output_dir={tmpdir} \ - checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ - checkpointer.checkpoint_dir='{ckpt_dir}' \ + checkpointer.checkpoint_dir='{policy_tmpdir}' \ checkpointer.checkpoint_files=[{os.path.join(epoch_folder_minus_one, policy_model_ckpt_fname)}]\ checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")}\ checkpointer.output_dir={policy_tmpdir} \ - checkpointer.model_type=LLAMA3 \ - ref_policy_checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ - ref_policy_checkpointer.checkpoint_dir='{ckpt_dir}' \ - ref_policy_checkpointer.checkpoint_files=[{policy_ckpt_path}]\ - ref_policy_checkpointer.model_type=LLAMA3 \ + ref_policy_checkpointer.checkpoint_dir='{policy_ckpt_dir}' \ + ref_policy_checkpointer.checkpoint_files=[model.safetensors]\ - value_checkpointer.checkpoint_dir='{ckpt_dir}' \ - value_checkpointer.checkpoint_files=[{os.path.join(value_tmpdir, epoch_folder_minus_one, value_model_ckpt_fname)}]\ + value_checkpointer.checkpoint_dir='{value_tmpdir}' \ + value_checkpointer.checkpoint_files=[{os.path.join(epoch_folder_minus_one, value_model_ckpt_fname)}]\ value_checkpointer.output_dir={value_tmpdir} \ - reward_checkpointer.checkpoint_dir='{ckpt_dir}' \ - reward_checkpointer.checkpoint_files=[{reward_ckpt_path}]\ + reward_checkpointer.checkpoint_dir='{reward_ckpt_path.parent}' \ + 
reward_checkpointer.checkpoint_files=[{reward_ckpt_path.name}]\ resume_from_checkpoint=True \ - metric_logger._component_=torchtune.training.metric_logging.DiskLogger \ metric_logger.filename={resumed_log_file} \ optimizer_in_bwd=True diff --git a/tests/recipes/test_qat_distributed.py b/tests/recipes/test_qat_distributed.py index 1ea8544fd9..083c82175b 100644 --- a/tests/recipes/test_qat_distributed.py +++ b/tests/recipes/test_qat_distributed.py @@ -3,7 +3,6 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. - import runpy import sys @@ -13,12 +12,7 @@ import torch from tests.common import TUNE_PATH -from tests.recipes.utils import ( - CKPT_COMPONENT_MAP, - dummy_alpaca_dataset_config, - MODEL_TEST_CONFIGS, - write_hf_ckpt_config, -) +from tests.recipes.utils import dummy_alpaca_dataset_config, MODEL_TEST_CONFIGS from tests.test_utils import ( CKPT_MODEL_PATHS, gen_log_file_name, @@ -43,68 +37,60 @@ def _get_test_config_overrides(self): "log_every_n_steps=1", ] + dummy_alpaca_dataset_config() - def _fetch_expected_loss_values(self, model_type): + def _fetch_expected_loss_values(self, model_ckpt): loss_values_map = { - "llama3": [ + "llama3_hf_138m": [ + # TODO 11.977460861206055, 11.978384017944336, 11.946539878845215, 11.909686088562012, ], } - return loss_values_map[model_type] + return loss_values_map[model_ckpt] @pytest.mark.integration_test @pytest.mark.parametrize( - "config, model_type, ckpt_type, micro_batch_size, gradient_accumulation_steps", + "config, model_ckpt, micro_batch_size, gradient_accumulation_steps", [ - ("llama3/8B_qat_full", "llama3", "tune", 4, 1), - ("llama3/8B_qat_full", "llama3", "tune", 1, 4), + ("llama3/8B_qat_full", "llama3_hf_138m", 4, 1), + ("llama3/8B_qat_full", "llama3_hf_138m", 1, 4), ], ) @gpu_test(gpu_count=4) def test_loss( self, config, - model_type, - ckpt_type, + model_ckpt, micro_batch_size, gradient_accumulation_steps, tmpdir, monkeypatch, 
): - ckpt_component = CKPT_COMPONENT_MAP[ckpt_type] - ckpt = model_type + "_" + ckpt_type - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - # Config file needed for model conversion. - write_hf_ckpt_config(ckpt_dir) - cmd = f""" tune run --nnodes 1 --nproc_per_node 4 qat_distributed \ --config {config} \ output_dir={tmpdir} \ batch_size={micro_batch_size} \ gradient_accumulation_steps={gradient_accumulation_steps} \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ metric_logger.filename={log_file} \ """.split() - model_config = MODEL_TEST_CONFIGS[model_type] + model_config = MODEL_TEST_CONFIGS[model_ckpt] cmd = cmd + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd) runpy.run_path(TUNE_PATH, run_name="__main__") loss_values = get_loss_values_from_metric_logger(log_file) - expected_loss_values = self._fetch_expected_loss_values(model_type) + expected_loss_values = self._fetch_expected_loss_values(model_ckpt) torch.testing.assert_close( loss_values, expected_loss_values, rtol=1e-3, atol=1e-3 diff --git a/tests/recipes/test_qat_lora_finetune_distributed.py b/tests/recipes/test_qat_lora_finetune_distributed.py index 39f7bf9ed9..052ed7e358 100644 --- a/tests/recipes/test_qat_lora_finetune_distributed.py +++ b/tests/recipes/test_qat_lora_finetune_distributed.py @@ -13,12 +13,7 @@ import torch from omegaconf import OmegaConf from tests.common import TUNE_PATH -from tests.recipes.utils import ( - CKPT_COMPONENT_MAP, - 
dummy_alpaca_dataset_config, - MODEL_TEST_CONFIGS, - write_hf_ckpt_config, -) +from tests.recipes.utils import dummy_alpaca_dataset_config, MODEL_TEST_CONFIGS from tests.test_utils import ( CKPT_MODEL_PATHS, gen_log_file_name, @@ -50,22 +45,23 @@ def _get_test_config_overrides(self): "compile=False", ] + dummy_alpaca_dataset_config() - def _fetch_expected_loss_values(self, model_type): + def _fetch_expected_loss_values(self, model_ckpt): loss_values_map = { - "llama3": [ + "llama3_hf_138m": [ + # TODO 11.977421760559082, 11.979637145996094, 11.948746681213379, 11.912514686584473, ], } - return loss_values_map[model_type] + return loss_values_map[model_ckpt] @pytest.mark.integration_test @gpu_test(gpu_count=4) @pytest.mark.parametrize( - "micro_batch_size, gradient_accumulation_steps, should_compile", - [(4, 1, True), (1, 4, False)], + "model_ckpt, micro_batch_size, gradient_accumulation_steps, should_compile", + [("llama3_hf_138m", 4, 1, True), ("llama3_hf_138m", 1, 4, False)], ) def test_loss( self, @@ -74,11 +70,10 @@ def test_loss( should_compile, tmpdir, monkeypatch, + model_ckpt, ): - ckpt = "llama3_tune" - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS["llama3"]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) cmd = f""" tune run --nnodes 1 --nproc_per_node 4 qat_lora_finetune_distributed @@ -86,13 +81,11 @@ def test_loss( batch_size={micro_batch_size} \ gradient_accumulation_steps={gradient_accumulation_steps} \ output_dir={tmpdir} \ - checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type=LLAMA3 \ metric_logger.filename={log_file} \ - tokenizer.path={tokenizer_path} \ + 
tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ compile={should_compile} \ enable_activation_checkpointing=False \ @@ -100,14 +93,14 @@ def test_loss( quantizer.groupsize=32 \ """.split() - model_config = MODEL_TEST_CONFIGS["llama3_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd = cmd + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd) runpy.run_path(TUNE_PATH, run_name="__main__") loss_values = get_loss_values_from_metric_logger(log_file) - expected_loss_values = self._fetch_expected_loss_values("llama3") + expected_loss_values = self._fetch_expected_loss_values(model_ckpt) torch.testing.assert_close( loss_values, expected_loss_values, rtol=1e-5, atol=1e-5 ) @@ -115,16 +108,15 @@ def test_loss( @pytest.mark.integration_test @gpu_test(gpu_count=4) @pytest.mark.parametrize( - "config, model_type, ckpt_type, save_adapter_weights_only", + "config, model_ckpt, save_adapter_weights_only", [ - ("llama3/8B_qat_lora", "llama3", "tune", False), + ("llama3/8B_qat_lora", "llama3_hf_138m", False), ], ) def test_training_state_on_resume( self, config, - model_type, - ckpt_type, + model_ckpt, tmpdir, monkeypatch, save_adapter_weights_only, @@ -136,20 +128,12 @@ def test_training_state_on_resume( - Resume training after epoch 1 - Make sure final loss matches the expected value of a model successfully resumed from a ckpt """ - ckpt_component = CKPT_COMPONENT_MAP[ckpt_type] - ckpt = model_type + "_" + ckpt_type - expected_loss_values = self._fetch_expected_loss_values(model_type) + expected_loss_values = self._fetch_expected_loss_values(model_ckpt) - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) - # Train for two epochs cmd_1 = f""" tune run --nnodes 1 --nproc_per_node 4 qat_lora_finetune_distributed \ @@ -157,11 +141,9 @@ def test_training_state_on_resume( batch_size=4 \ gradient_accumulation_steps=1 \ output_dir={tmpdir} \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ save_adapter_weights_only={save_adapter_weights_only} \ @@ -170,7 +152,7 @@ def test_training_state_on_resume( quantizer.groupsize=32 \ """.split() - model_config = MODEL_TEST_CONFIGS[model_type + "_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd_1) @@ -185,13 +167,11 @@ def test_training_state_on_resume( batch_size=4 \ gradient_accumulation_steps=1 \ output_dir={tmpdir} \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ - checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} - checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")} + checkpointer.checkpoint_files=[model.safetensors]\ + checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} \ + checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")} \ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ resume_from_checkpoint=True \ @@ -205,7 +185,7 @@ def test_training_state_on_resume( 
monkeypatch.setattr(sys, "argv", cmd_2) runpy.run_path(TUNE_PATH, run_name="__main__") - expected_loss_values = self._fetch_expected_loss_values(model_type)[2:] + expected_loss_values = self._fetch_expected_loss_values(model_ckpt)[2:] loss_values = get_loss_values_from_metric_logger(log_file) torch.testing.assert_close( @@ -215,16 +195,15 @@ def test_training_state_on_resume( @pytest.mark.integration_test @gpu_test(gpu_count=2) @pytest.mark.parametrize( - "config, model_type, ckpt_type, save_adapter_weights_only", + "config, model_ckpt, save_adapter_weights_only", [ - ("llama3/8B_qat_lora", "llama3", "tune", False), + ("llama3/8B_qat_lora", "llama3_hf_138m", False), ], ) def test_training_state_on_resume_with_async_checkpointing( self, config, - model_type, - ckpt_type, + model_ckpt, tmpdir, monkeypatch, save_adapter_weights_only, @@ -236,20 +215,12 @@ def test_training_state_on_resume_with_async_checkpointing( - Resume training after epoch 1 - Make sure final loss matches the expected value of a model successfully resumed from a ckpt """ - ckpt_component = CKPT_COMPONENT_MAP[ckpt_type] - ckpt = model_type + "_" + ckpt_type - expected_loss_values = self._fetch_expected_loss_values(model_type) + expected_loss_values = self._fetch_expected_loss_values(model_ckpt) - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - # Config file needed for model conversion. 
- # Create a second copy for training resume - write_hf_ckpt_config(ckpt_dir) - write_hf_ckpt_config(tmpdir) - # Train for two epochs cmd_1 = f""" tune run --nnodes 1 --nproc_per_node 2 qat_lora_finetune_distributed \ @@ -257,11 +228,9 @@ def test_training_state_on_resume_with_async_checkpointing( batch_size=8 \ gradient_accumulation_steps=1 \ output_dir={tmpdir} \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ save_adapter_weights_only={save_adapter_weights_only} \ @@ -271,7 +240,7 @@ def test_training_state_on_resume_with_async_checkpointing( quantizer.groupsize=32 \ """.split() - model_config = MODEL_TEST_CONFIGS[model_type + "_lora"] + model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd_1 = cmd_1 + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd_1) @@ -286,13 +255,11 @@ def test_training_state_on_resume_with_async_checkpointing( batch_size=8 \ gradient_accumulation_steps=1 \ output_dir={tmpdir} \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir={ckpt_dir} \ - checkpointer.checkpoint_files=[{ckpt_path}]\ - checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} - checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")} + checkpointer.checkpoint_files=[model.safetensors]\ + checkpointer.adapter_checkpoint={os.path.join(epoch_folder_minus_one, f"{ADAPTER_MODEL_FNAME}.pt")} \ + checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")} \ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ 
resume_from_checkpoint=True \ @@ -307,7 +274,7 @@ def test_training_state_on_resume_with_async_checkpointing( monkeypatch.setattr(sys, "argv", cmd_2) runpy.run_path(TUNE_PATH, run_name="__main__") - expected_loss_values = self._fetch_expected_loss_values(model_type)[2:] + expected_loss_values = self._fetch_expected_loss_values(model_ckpt)[2:] loss_values = get_loss_values_from_metric_logger(log_file) torch.testing.assert_close( @@ -316,32 +283,26 @@ def test_training_state_on_resume_with_async_checkpointing( @pytest.mark.integration_test @pytest.mark.parametrize( - "recipe_config, model_type, ckpt_type", + "recipe_config, model_ckpt", [ - ("llama3/8B_qat_lora", "llama3", "tune"), + ("llama3/8B_qat_lora", "llama3_hf_138m"), ], ) @gpu_test(gpu_count=4) def test_save_and_load_merged_weights( - self, recipe_config, model_type, ckpt_type, tmpdir, monkeypatch + self, recipe_config, model_ckpt, tmpdir, monkeypatch ): - ckpt_component = CKPT_COMPONENT_MAP[ckpt_type] - ckpt = model_type + "_" + ckpt_type - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) cmd = f""" tune run --nnodes 1 --nproc_per_node 4 qat_lora_finetune_distributed \ --config {recipe_config} \ batch_size=4 \ gradient_accumulation_steps=1 \ output_dir={tmpdir} \ - model=torchtune.models.lora_small_test_model \ - checkpointer._component_={ckpt_component} \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[model.safetensors]\ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ enable_activation_checkpointing=True \ @@ -349,7 +310,7 @@ def test_save_and_load_merged_weights( quantizer.groupsize=32 \ """.split() - model_config = MODEL_TEST_CONFIGS[model_type + "_lora"] 
+ model_config = MODEL_TEST_CONFIGS[model_ckpt + "_lora"] cmd = cmd + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd) @@ -364,7 +325,7 @@ def test_save_and_load_merged_weights( lora_model = config.instantiate(OmegaConf.from_dotlist(model_config).model) # Build base model for loading merged weights - base_config = MODEL_TEST_CONFIGS[model_type] + base_config = MODEL_TEST_CONFIGS[model_ckpt] model = config.instantiate(OmegaConf.from_dotlist(base_config).model) # Load base model and trained adapter weights into LoRA model and call fwd @@ -372,14 +333,16 @@ def test_save_and_load_merged_weights( adpt_path = os.path.join(tmpdir, epoch_folder, f"{ADAPTER_MODEL_FNAME}.pt") lora_sd = safe_torch_load(adpt_path, weights_only=True) - with open(ckpt_path, "rb") as f: - base_model_sd = torch.load(f, weights_only=True) + # Load base model from HF checkpoint + base_model_path = os.path.join(ckpt_dir, "model.safetensors") + base_model_sd = safe_torch_load(base_model_path, weights_only=True) + lora_model.load_state_dict(lora_sd, strict=False) lora_model.load_state_dict(base_model_sd, strict=False) baseline_out = lora_model(inputs) # Load merged final ckpt directly into model and call fwd - suffix = ".safetensors" if ckpt_type == "hf" else ".bin" + suffix = ".safetensors" model_ckpt_fname = ( SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)) + suffix ) diff --git a/tests/recipes/test_qat_single_device.py b/tests/recipes/test_qat_single_device.py index e9c6fe52fc..0244a22b6d 100644 --- a/tests/recipes/test_qat_single_device.py +++ b/tests/recipes/test_qat_single_device.py @@ -12,11 +12,7 @@ import torch from tests.common import TUNE_PATH -from tests.recipes.utils import ( - dummy_alpaca_dataset_config, - MODEL_TEST_CONFIGS, - write_hf_ckpt_config, -) +from tests.recipes.utils import dummy_alpaca_dataset_config, MODEL_TEST_CONFIGS from tests.test_utils import ( CKPT_MODEL_PATHS, gen_log_file_name, @@ -40,53 +36,47 @@ def 
_get_test_config_overrides(self): "log_every_n_steps=1", ] + dummy_alpaca_dataset_config() - def _fetch_expected_loss_values(self, model_type, ckpt_type): - # logic here may need to be adjusted in the future - return [12.0118, 11.9262, 11.8976, 11.9700] + def _fetch_expected_loss_values(self, model_ckpt): + expected_losses = { + "llama3": [12.0118, 11.9262, 11.8976, 11.9700], + "llama3_hf_138m": [], + } + return expected_losses[model_ckpt] @pytest.mark.integration_test @gpu_test(gpu_count=1) @pytest.mark.parametrize( - "model_type, ckpt_type, micro_batch_size, gradient_accumulation_steps", + "model_ckpt, micro_batch_size, gradient_accumulation_steps", [ - ("llama3", "tune", 1, 1), + ("llama3_hf_138m", 1, 1), ], ) def test_loss( self, - model_type, - ckpt_type, + model_ckpt, micro_batch_size, gradient_accumulation_steps, tmpdir, monkeypatch, ): - ckpt = model_type + "_" + ckpt_type - ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) - tokenizer_path = Path(TOKENIZER_PATHS[model_type]) - ckpt_dir = ckpt_path.parent + ckpt_dir = Path(CKPT_MODEL_PATHS[model_ckpt]) + tokenizer_path = Path(TOKENIZER_PATHS[model_ckpt]) log_file = gen_log_file_name(tmpdir) - # Config file needed for model conversion. 
- write_hf_ckpt_config(ckpt_dir) - cmd = f""" tune run qat_single_device \ --config llama2/1B_qat_single_device \ output_dir={tmpdir} \ batch_size={micro_batch_size} \ gradient_accumulation_steps={gradient_accumulation_steps} \ - checkpointer._component_=torchtune.training.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}] \ + checkpointer.checkpoint_files=[model.safetensors] \ checkpointer.output_dir={tmpdir} \ - checkpointer.model_type={model_type.upper()} \ - tokenizer._component_=torchtune.models.llama3.llama3_tokenizer \ tokenizer.path='{tokenizer_path}' \ tokenizer.prompt_template=null \ metric_logger.filename={log_file} \ """.split() - model_config = MODEL_TEST_CONFIGS[model_type] + model_config = MODEL_TEST_CONFIGS[model_ckpt] cmd = cmd + self._get_test_config_overrides() + model_config monkeypatch.setattr(sys, "argv", cmd) @@ -94,5 +84,5 @@ def test_loss( runpy.run_path(TUNE_PATH, run_name="__main__") loss_values = get_loss_values_from_metric_logger(log_file) - expected_losses = self._fetch_expected_loss_values(model_type, ckpt_type) + expected_losses = self._fetch_expected_loss_values(model_ckpt) torch.testing.assert_close(loss_values, expected_losses, rtol=1e-3, atol=1e-3) diff --git a/tests/recipes/utils.py b/tests/recipes/utils.py index 2303a8be4a..209e0124cc 100644 --- a/tests/recipes/utils.py +++ b/tests/recipes/utils.py @@ -281,6 +281,38 @@ def lora_llama3_test_config( return config_overrides +def lora_llama3_test_config_138m( + lora_attn_modules: list[str], + apply_lora_to_mlp=False, + apply_lora_to_output=False, + lora_rank=8, + lora_alpha=16, + quantize_base: bool = False, + use_dora: bool = False, +) -> list[str]: + """ + Test config with slightly larger embed dim to be paged and flex attention friendly + """ + return [ + f"model.lora_rank={lora_rank}", + f"model.lora_alpha={lora_alpha}", + f"model.lora_attn_modules={lora_attn_modules}", + f"model.apply_lora_to_mlp={apply_lora_to_mlp}", + 
f"model.apply_lora_to_output={apply_lora_to_output}", + "model._component_=torchtune.models.llama3.lora_llama3", + "model.vocab_size=128_256", + "model.num_layers=2", + "model.num_heads=16", + "model.embed_dim=512", + "model.max_seq_len=1024", + "model.norm_eps=1e-5", + "model.num_kv_heads=8", + "model.lora_dropout=0.0", + f"model.quantize_base={quantize_base}", + f"model.use_dora={use_dora}", + ] + + def write_hf_ckpt_config(ckpt_dir: Union[str, Path]): config = { "hidden_size": 256, @@ -376,4 +408,27 @@ def write_hf_vision_ckpt_config(ckpt_dir: str): use_dora=True, ), "llama3_hf_138m": llama3_test_config_138m(), + "llama3_hf_138m_lora": lora_llama3_test_config_138m( + lora_attn_modules=["q_proj", "k_proj", "v_proj", "output_proj"], + apply_lora_to_mlp=False, + apply_lora_to_output=False, + lora_rank=8, + lora_alpha=16, + ), + "llama3_hf_138m_qlora": lora_llama3_test_config_138m( + lora_attn_modules=["q_proj", "k_proj", "v_proj", "output_proj"], + apply_lora_to_mlp=False, + apply_lora_to_output=False, + lora_rank=8, + lora_alpha=16, + quantize_base=True, + ), + "llama3_hf_138m_dora": lora_llama3_test_config_138m( + lora_attn_modules=["q_proj", "k_proj", "v_proj", "output_proj"], + apply_lora_to_mlp=False, + apply_lora_to_output=False, + lora_rank=8, + lora_alpha=16, + use_dora=True, + ), }