Skip to content

Commit 66c6560

Browse files
authored
Merge branch 'master' into pagaray_master_tweak
2 parents a513055 + 5dc19c8 commit 66c6560

File tree

8 files changed

+99
-26
lines changed

8 files changed

+99
-26
lines changed

launcher_scripts/conf/config.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ env_vars:
5656
TRANSFORMERS_OFFLINE: 1
5757
TORCH_NCCL_AVOID_RECORD_STREAMS: 1
5858
NCCL_NVLS_ENABLE: 0
59-
NVTE_APPLY_QK_LAYER_SCALING: 1
6059

6160
# GPU Mapping
6261
numa_mapping:

launcher_scripts/conf/data_preparation/generic/custom_dataset.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ bpe_save_dir: ${.custom_dataset_dir}/bpe # Dir to save sentence piece tokenizer
2626
preprocess_data: True # True to preprocess the data from json, jsonl or json.gz files, False otherwise.
2727
raw_dataset_files: # Either a string (path to dataset folder) or a list (of files)
2828
- null # Each file should be input json, jsonl or json.gz file
29+
tokenizer_library: sentencepiece # Name of the tokenizer library, such as "sentencepiece" or "megatron"
30+
tokenizer_type: null # Type of tokenizer to use if not training a tokenizer from scratch, such as "GPT2BPETokenizer"
2931
tokenizer_model: ${.bpe_save_dir}/${data_preparation.train_tokenizer_args.model_prefix}.model # trained SentencePiece tokenizer model
32+
vocab_file: null # Path to a vocab file if using BPE tokenizer. Leave "null" if not using BPE.
33+
merges_file: null # Path to a merges file if using BPE tokenizer. Leave "null" if not using BPE.
3034
preprocess_worker_mapping: ${.custom_dataset_dir}/preprocess_mapping
31-
preprocessed_dir: ${.custom_dataset_dir}/preprocessed
35+
preprocessed_dir: ${.custom_dataset_dir}/preprocessed

launcher_scripts/nemo_launcher/collections/dataprep_scripts/custom_dataprep/preprocess.py

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,41 @@
5050
type=int,
5151
)
5252
parser.add_argument("--bcp", action="store_true", help="Whether on BCP platform")
53+
parser.add_argument(
54+
"--vocab-file",
55+
default=None,
56+
help="If using BPE tokenizer, specify the path to a vocab file. Keep None if not using BPE.",
57+
type=str,
58+
)
59+
parser.add_argument(
60+
"--merges-file",
61+
default=None,
62+
help="If using BPE tokenizer, specify the path to a merges file. Keep None if not using BPE.",
63+
type=str,
64+
)
65+
parser.add_argument(
66+
"--tokenizer-library",
67+
default="sentencepiece",
68+
help="Name of the tokenizer library, such as sentencepiece or megatron",
69+
type=str,
70+
)
71+
parser.add_argument(
72+
"--tokenizer-type",
73+
default=None,
74+
help="Name of the tokenizer type to use, such as GPT2BPETokenizer",
75+
type=str,
76+
)
77+
parser.add_argument(
78+
"--dataset-impl",
79+
default="mmap",
80+
help="Specify how the dataset is stored and will be processed.",
81+
type=str,
82+
)
5383
args, other_args = parser.parse_known_args()
5484

5585
workers_per_node = args.workers_per_node # local world size
5686
if args.bcp:
57-
global_rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
87+
global_rank = int(os.environ.get("RANK", 0))
5888
task_id = global_rank // workers_per_node
5989
rank = global_rank % workers_per_node
6090
else: # on slurm based platforms
@@ -82,12 +112,25 @@
82112
print(
83113
f" ****** Task ID {task_id:02d} Rank {rank:02d} starts to preprocess {os.path.basename(split)}..."
84114
)
85-
input_arg = ["--input", split]
86-
output_arg = [
87-
"--output-prefix",
88-
os.path.join(args.output_path, os.path.basename(split)),
115+
input_arg = split
116+
output_arg = os.path.join(args.output_path, os.path.basename(split))
117+
118+
flags = [
119+
f"--input={split}",
120+
f"--output-prefix={output_arg}",
121+
f"--dataset-impl={args.dataset_impl}",
122+
f"--tokenizer-library={args.tokenizer_library}",
123+
f"--tokenizer-type={args.tokenizer_type}",
89124
]
90-
subprocess.check_call(cmd + input_arg + output_arg + other_args)
125+
126+
if args.vocab_file and args.merges_file:
127+
flags += [
128+
f"--vocab={args.vocab_file}",
129+
f"--merge-file={args.merges_file}",
130+
f"--append-eod",
131+
]
132+
133+
subprocess.check_call(cmd + flags + other_args)
91134
print(
92135
f" ****** Task ID {task_id:02d} Rank {rank:02d} finished preprocessing {os.path.basename(split)}..."
93136
)

launcher_scripts/nemo_launcher/collections/dataprep_scripts/mc4_dataprep/preprocess.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858

5959
workers_per_node = args.workers_per_node # local world size
6060
if args.bcp:
61-
global_rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
61+
global_rank = int(os.environ.get("RANK", 0))
6262
task_id = global_rank // workers_per_node
6363
rank = global_rank % workers_per_node
6464
else: # on slurm based platforms

launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,11 +106,16 @@ def main(cfg):
106106
elif cfg.get("cluster_type") in ["bcp", "k8s"]:
107107
file_numbers = cfg.get("file_numbers")
108108
files_list = utils.convert_file_numbers(file_numbers)
109-
# Assumes launched via mpirun:
110-
# mpirun -N <nnodes> -npernode 1 ...
111-
wrank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
112-
wsize = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 0))
113-
lrank = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", 0))
109+
if cfg.get("cluster_type") == "bcp":
110+
wrank = int(os.environ.get("RANK", 0))
111+
wsize = int(os.environ.get("WORLD_SIZE", 0))
112+
lrank = int(os.environ.get("LOCAL_RANK", 0))
113+
else:
114+
# Assumes launched via mpirun:
115+
# mpirun -N <nnodes> -npernode 1 ...
116+
wrank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
117+
wsize = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 0))
118+
lrank = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", 0))
114119

115120
if lrank == 0:
116121
# Compile once per node. Should be one container instance per node.

launcher_scripts/nemo_launcher/collections/eval_harness/download.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def parse_args(parser_main):
2626
# parser = argparse.ArgumentParser()
2727
parser = parser_main.add_argument_group(title="download-tasks")
2828
parser.add_argument("--tasks", default="all_tasks")
29-
parser.add_argument("--cache_dir", default="")
29+
parser.add_argument("--cache-dir", default="")
3030
# return parser.parse_args()
3131
return parser_main
3232

launcher_scripts/nemo_launcher/core/data_stages.py

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,6 @@ def _make_cluster_parameters(
152152
env_vars[
153153
"PYTHONPATH"
154154
] = f"{self._launcher_scripts_path}:${{PYTHONPATH}}" # Required by pile download
155-
env_vars["NGC_ARRAY_TYPE"] = "MPIJob" # Required by BCP
156155
setup = [f"export {k}={v}" for k, v in env_vars.items()]
157156

158157
cluster_parameters = {}
@@ -369,7 +368,6 @@ def _make_private_cluster_parameters(self, cluster: str, sub_stage: str) -> Dict
369368
return {
370369
"nodes": node_array_size,
371370
"ntasks_per_node": bcp_preproc_npernode,
372-
"bcp_launcher": "'mpirun --allow-run-as-root'",
373371
}
374372
return {}
375373

@@ -499,7 +497,6 @@ def _make_private_cluster_parameters(self, cluster: str, sub_stage: str) -> Dict
499497
return {
500498
"nodes": node_array_size,
501499
"ntasks_per_node": ntasks_per_node,
502-
"bcp_launcher": "'mpirun --allow-run-as-root'",
503500
}
504501
return {}
505502

@@ -588,6 +585,26 @@ def _make_sub_stages(self) -> List[str]:
588585
sub_stages += ["preprocess"]
589586
return sub_stages
590587

588+
def _filter_raw_json_files(self, raw_dataset_files: list) -> List:
589+
"""
590+
Filter the input dataset files to only include json files and derivatives.
591+
592+
:param list raw_dataset_files: List of the raw dataset files specified in the config
593+
:return: a list of only the json files in the dataset.
594+
:rtype: list
595+
"""
596+
if isinstance(raw_dataset_files, omegaconf.listconfig.ListConfig):
597+
return raw_dataset_files
598+
599+
filtered_files = []
600+
601+
for raw_file in os.listdir(raw_dataset_files):
602+
# Only select files that end in .json, .jsonl, or .json.gz
603+
if not Path(raw_file).suffix.lower() in [".json", ".jsonl", "json.gz"]:
604+
continue
605+
filtered_files.append(os.path.join(raw_dataset_files, raw_file))
606+
return filtered_files
607+
591608
def setup_folder_and_data(self) -> None:
592609
"""Setup job/data folders and fine-tuning/prompt-learning dataset"""
593610
job_path = self.get_job_path()
@@ -602,11 +619,8 @@ def setup_folder_and_data(self) -> None:
602619
preprocess_worker_mapping = data_cfg.get("preprocess_worker_mapping")
603620

604621
if data_cfg.get("preprocess_data", False):
605-
if not isinstance(raw_dataset_files, omegaconf.listconfig.ListConfig):
606-
raw_dataset_files = [
607-
os.path.join(raw_dataset_files, raw_file)
608-
for raw_file in os.listdir(raw_dataset_files)
609-
]
622+
raw_dataset_files = self._filter_raw_json_files(raw_dataset_files)
623+
610624
# Sort list of files in directory by size
611625
sorted_files = sorted(raw_dataset_files, key=lambda x: os.stat(x).st_size)
612626
file_sizes = [os.stat(x).st_size for x in sorted_files]
@@ -674,14 +688,14 @@ def _make_private_cluster_parameters(self, cluster: str, sub_stage: str) -> Dict
674688
return {
675689
"nodes": node_array_size,
676690
"ntasks_per_node": ntasks_per_node,
677-
"bcp_launcher": "'mpirun --allow-run-as-root'",
678691
}
679692
return {}
680693

681694
def _make_sub_stage_command(self, sub_stage: str) -> List[str]:
682695
"""Make a command of the specified sub-stage"""
683696
data_cfg = self.stage_cfg
684697
run_cfg = data_cfg.get("run")
698+
cluster_type = self.cfg.cluster_type
685699

686700
if sub_stage == "train_tokenizer":
687701
bpe_save_dir = Path(data_cfg.get("bpe_save_dir"))
@@ -699,14 +713,23 @@ def _make_sub_stage_command(self, sub_stage: str) -> List[str]:
699713
output_path=data_cfg.get("preprocessed_dir"),
700714
workers_per_node=run_cfg.get("workers_per_node"),
701715
worker_mapping_file=data_cfg.get("preprocess_worker_mapping"),
702-
tokenizer_library="sentencepiece",
716+
tokenizer_library=data_cfg.get("tokenizer_library"),
703717
tokenizer_model=data_cfg.get("tokenizer_model"),
718+
tokenizer_type=data_cfg.get("tokenizer_type"),
704719
dataset_impl="mmap",
705720
log_interval="2000",
706721
apply_ftfy="store_true",
707722
workers=run_cfg.get("cpus_per_node") // run_cfg.get("workers_per_node"),
708723
)
709724

725+
if cluster_type == "bcp":
726+
args += create_args_list(bcp="store_true")
727+
728+
if data_cfg.vocab_file and data_cfg.merges_file:
729+
args += create_args_list(
730+
vocab_file=data_cfg.vocab_file, merges_file=data_cfg.merges_file
731+
)
732+
710733
sub_stage_command = [f"python3 -u {code_path}", *args]
711734
sub_stage_command = " \\\n ".join(sub_stage_command)
712735
return [sub_stage_command]

launcher_scripts/tests/unit_tests/config_tests/test_main_config.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ def test_config(self):
6363
TRANSFORMERS_OFFLINE: 1
6464
TORCH_NCCL_AVOID_RECORD_STREAMS: 1
6565
NCCL_NVLS_ENABLE: 0
66-
NVTE_APPLY_QK_LAYER_SCALING: 1
6766
6867
# GPU Mapping
6968
numa_mapping:

0 commit comments

Comments
 (0)