Merged
2 changes: 2 additions & 0 deletions docs/source/BestPractices/GRPO代码训练.md
Original file line number Diff line number Diff line change
@@ -63,6 +63,7 @@ swift rlhf \
--train_type lora \
--torch_dtype bfloat16 \
--dataset 'open-r1/verifiable-coding-problems-python-10k' \
--load_from_cache_file true \
--max_completion_length 2048 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
@@ -112,6 +113,7 @@ swift rlhf \
--train_type lora \
--torch_dtype bfloat16 \
--dataset 'open-r1/verifiable-coding-problems-python-10k' \
--load_from_cache_file true \
--max_completion_length 2048 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
3 changes: 3 additions & 0 deletions docs/source/BestPractices/GRPO多模态训练.md
@@ -129,6 +129,7 @@ swift rlhf \
--train_type full \
--torch_dtype bfloat16 \
--dataset 'AI-ModelScope/clevr_cogen_a_train' \
--load_from_cache_file true \
--max_completion_length 1024 \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
@@ -203,6 +204,7 @@ swift rlhf \
--train_type full \
--torch_dtype bfloat16 \
--dataset 'AI-ModelScope/GEOQA_R1V_Train_8K' \
--load_from_cache_file true \
--max_completion_length 1024 \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
@@ -270,6 +272,7 @@ swift rlhf \
--train_type full \
--torch_dtype bfloat16 \
--dataset 'lmms-lab/multimodal-open-r1-8k-verified' \
--load_from_cache_file true \
--max_completion_length 1024 \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
1 change: 1 addition & 0 deletions docs/source/BestPractices/GRPO完整流程.md
@@ -134,6 +134,7 @@ swift rlhf \
--train_type full \
--torch_dtype bfloat16 \
--dataset 'zouxuhong/Countdown-Tasks-3to4#50000' \
--load_from_cache_file true \
--max_length 2048 \
--max_completion_length 1024 \
--num_train_epochs 1 \
4 changes: 4 additions & 0 deletions docs/source/BestPractices/Qwen3最佳实践.md
@@ -145,6 +145,7 @@ swift sft \
--train_type lora \
--dataset 'swift/Qwen3-SFT-Mixin#2000' \
'swift/self-cognition:qwen3#600' \
--load_from_cache_file true \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
@@ -222,6 +223,7 @@ swift sft \
--model Qwen/Qwen3-8B \
--train_type full \
--dataset '<your-dataset>' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--torch_dtype bfloat16 \
--per_device_train_batch_size 1 \
@@ -292,6 +294,7 @@ swift rlhf \
--model Qwen/Qwen3-8B \
--train_type full \
--dataset 'AI-MO/NuminaMath-TIR#5000' \
--load_from_cache_file true \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
@@ -339,6 +342,7 @@ NODE_RANK=$RANK \
megatron sft \
--load Qwen3-30B-A3B-Base-mcore \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--pipeline_model_parallel_size 2 \
--expert_model_parallel_size 8 \
2 changes: 2 additions & 0 deletions docs/source/BestPractices/快速训练VL模型.md
@@ -114,6 +114,7 @@ swift sft \
--model_type qwen2_5_vl \
--train_type full \
--dataset xxx \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--torch_dtype bfloat16 \
--attn_impl flash_attn \
@@ -150,6 +151,7 @@ swift sft \
--model_type qwen2_5_vl \
--train_type full \
--dataset xxx \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--torch_dtype bfloat16 \
--attn_impl flash_attn \
4 changes: 2 additions & 2 deletions docs/source/Instruction/命令行参数.md
@@ -55,8 +55,8 @@
- Note: For "ms-swift<3.6", the default value of this parameter is 0.01.
- data_seed: Random seed for the dataset, default is 42.
- 🔥dataset_num_proc: Number of processes for dataset preprocessing, default is 1.
- 🔥load_from_cache_file: Whether to load the dataset from cache, default is True.
- Note: It is recommended to set this parameter to False during the debug phase.
- 🔥load_from_cache_file: Whether to load the dataset from cache. Default is False. It is recommended to set it to True for actual runs and to False while debugging.
- Note: This parameter defaults to True in "ms-swift<3.9".
- dataset_shuffle: Whether to shuffle the dataset. Default is True.
- Note: Shuffling in CPT/SFT consists of two parts: dataset shuffling, controlled by `dataset_shuffle`; and shuffling in the train_dataloader, controlled by `train_dataloader_shuffle`.
- val_dataset_shuffle: Whether to shuffle the val_dataset. Default is False.
3 changes: 3 additions & 0 deletions docs/source/Megatron-SWIFT/多模态模型.md
@@ -31,6 +31,7 @@ CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--load Qwen2.5-VL-7B-Instruct-mcore \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--tensor_model_parallel_size 2 \
--sequence_parallel true \
--packing true \
@@ -83,6 +84,7 @@ CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--load Qwen2.5-VL-7B-Instruct-mcore \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--train_type lora \
--lora_rank 8 \
--lora_alpha 32 \
@@ -169,6 +171,7 @@ CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--load InternVL3_5-30B-A3B-mcore \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--train_type lora \
--lora_rank 8 \
--lora_alpha 32 \
2 changes: 2 additions & 0 deletions docs/source_en/BestPractices/GRPO-Code-Training.md
@@ -67,6 +67,7 @@ swift rlhf \
--train_type lora \
--torch_dtype bfloat16 \
--dataset 'open-r1/verifiable-coding-problems-python-10k' \
--load_from_cache_file true \
--max_completion_length 2048 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
@@ -115,6 +116,7 @@ swift rlhf \
--train_type lora \
--torch_dtype bfloat16 \
--dataset 'open-r1/verifiable-coding-problems-python-10k' \
--load_from_cache_file true \
--max_completion_length 2048 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
3 changes: 3 additions & 0 deletions docs/source_en/BestPractices/GRPO-Multi-Modal-Training.md
@@ -140,6 +140,7 @@ swift rlhf \
--train_type full \
--torch_dtype bfloat16 \
--dataset 'AI-ModelScope/clevr_cogen_a_train' \
--load_from_cache_file true \
--max_completion_length 1024 \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
@@ -218,6 +219,7 @@ swift rlhf \
--train_type full \
--torch_dtype bfloat16 \
--dataset 'AI-ModelScope/GEOQA_R1V_Train_8K' \
--load_from_cache_file true \
--max_completion_length 1024 \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
@@ -307,6 +309,7 @@ swift rlhf \
--train_type full \
--torch_dtype bfloat16 \
--dataset 'lmms-lab/multimodal-open-r1-8k-verified' \
--load_from_cache_file true \
--max_completion_length 1024 \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
1 change: 1 addition & 0 deletions docs/source_en/BestPractices/GRPO.md
@@ -148,6 +148,7 @@ swift rlhf \
--train_type full \
--torch_dtype bfloat16 \
--dataset 'zouxuhong/Countdown-Tasks-3to4#50000' \
--load_from_cache_file true \
--max_length 2048 \
--max_completion_length 1024 \
--num_train_epochs 1 \
4 changes: 4 additions & 0 deletions docs/source_en/BestPractices/Qwen3-Best-Practice.md
@@ -149,6 +149,7 @@ swift sft \
--train_type lora \
--dataset 'swift/Qwen3-SFT-Mixin#2000' \
'swift/self-cognition:qwen3#600' \
--load_from_cache_file true \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
@@ -225,6 +226,7 @@ swift sft \
--model Qwen/Qwen3-8B \
--train_type full \
--dataset '<your-dataset>' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--torch_dtype bfloat16 \
--per_device_train_batch_size 1 \
@@ -296,6 +298,7 @@ swift rlhf \
--model Qwen/Qwen3-8B \
--train_type full \
--dataset 'AI-MO/NuminaMath-TIR#5000' \
--load_from_cache_file true \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
@@ -343,6 +346,7 @@ NODE_RANK=$RANK \
megatron sft \
--load Qwen3-30B-A3B-Base-mcore \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--pipeline_model_parallel_size 2 \
--expert_model_parallel_size 8 \
2 changes: 2 additions & 0 deletions docs/source_en/BestPractices/Rapidly-Training-VL-model.md
@@ -113,6 +113,7 @@ swift sft \
--model_type qwen2_5_vl \
--train_type full \
--dataset xxx \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--torch_dtype bfloat16 \
--attn_impl flash_attn \
@@ -150,6 +151,7 @@ swift sft \
--model_type qwen2_5_vl \
--train_type full \
--dataset xxx \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--torch_dtype bfloat16 \
--attn_impl flash_attn \
4 changes: 2 additions & 2 deletions docs/source_en/Instruction/Command-line-parameters.md
@@ -55,8 +55,8 @@ Hints:
- Note: For "ms-swift<3.6", the default value of this parameter is 0.01.
- data_seed: Random seed for the dataset, default is 42.
- 🔥dataset_num_proc: Number of processes for dataset preprocessing, default is 1.
- 🔥load_from_cache_file: Whether to load the dataset from the cache, default is True.
- Note: It is recommended to set this parameter to False during the debug phase.
- 🔥load_from_cache_file: Whether to load the dataset from cache. Default is False. It is recommended to set it to True for actual runs and to False while debugging.
- Note: This parameter defaults to True in "ms-swift<3.9".
- dataset_shuffle: Whether to shuffle the dataset. Defaults to True.
- Note: The shuffling in CPT/SFT consists of two parts: dataset shuffling, controlled by `dataset_shuffle`; and shuffling in the train_dataloader, controlled by `train_dataloader_shuffle`.
- val_dataset_shuffle: Whether to perform shuffling on the val_dataset. Default is False.
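The flag added throughout this PR controls whether preprocessed dataset results are reloaded from an on-disk cache or recomputed from scratch. The mechanism can be sketched as follows — a minimal, self-contained illustration using a toy content fingerprint, not ms-swift's actual cache implementation (the function names and cache layout here are hypothetical):

```python
import hashlib
import json
import os
import tempfile

def preprocess(example):
    # Toy preprocessing step whose results we want to cache.
    return {"text": example["text"].strip().lower()}

def load_dataset(rows, load_from_cache_file, cache_dir):
    """Mimic dataset preprocessing with an on-disk cache.

    When load_from_cache_file is True, a previously written cache file keyed
    by a fingerprint of the raw data is reused; when False, preprocessing
    always reruns, so edits to the preprocessing code are never masked by a
    stale cache (which is why False is suggested while debugging).
    """
    raw = json.dumps(rows, sort_keys=True).encode()
    fingerprint = hashlib.sha256(raw).hexdigest()
    cache_path = os.path.join(cache_dir, f"{fingerprint}.json")
    if load_from_cache_file and os.path.exists(cache_path):
        with open(cache_path) as f:
            return json.load(f), True  # (data, came_from_cache)
    processed = [preprocess(r) for r in rows]
    with open(cache_path, "w") as f:
        json.dump(processed, f)
    return processed, False

rows = [{"text": "  Hello World  "}]
with tempfile.TemporaryDirectory() as d:
    first, hit1 = load_dataset(rows, load_from_cache_file=True, cache_dir=d)
    second, hit2 = load_dataset(rows, load_from_cache_file=True, cache_dir=d)
    third, hit3 = load_dataset(rows, load_from_cache_file=False, cache_dir=d)
```

Under this sketch, the first call misses the cache and preprocesses, the second call is served from the cache file, and passing `load_from_cache_file=False` forces a rerun even though a cache exists.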
3 changes: 3 additions & 0 deletions docs/source_en/Megatron-SWIFT/Multimodal-Model.md
@@ -31,6 +31,7 @@ CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--load Qwen2.5-VL-7B-Instruct-mcore \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--tensor_model_parallel_size 2 \
--sequence_parallel true \
--packing true \
@@ -85,6 +86,7 @@ CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--load Qwen2.5-VL-7B-Instruct-mcore \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--train_type lora \
--lora_rank 8 \
--lora_alpha 32 \
@@ -171,6 +173,7 @@ CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
--load InternVL3_5-30B-A3B-mcore \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--train_type lora \
--lora_rank 8 \
--lora_alpha 32 \
1 change: 1 addition & 0 deletions examples/megatron/base_to_chat.sh
@@ -5,6 +5,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
megatron sft \
--load Qwen2.5-14B-mcore \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--tensor_model_parallel_size 4 \
--micro_batch_size 1 \
1 change: 1 addition & 0 deletions examples/megatron/benchmark/deepspeed.sh
@@ -8,6 +8,7 @@ swift sft \
--model Qwen/Qwen2.5-14B \
--train_type full \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--torch_dtype bfloat16 \
--max_steps 2000 \
1 change: 1 addition & 0 deletions examples/megatron/dense/72b_offload.sh
@@ -6,6 +6,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
megatron sft \
--load Qwen2.5-72B-Instruct-mcore \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--tensor_model_parallel_size 8 \
--micro_batch_size 1 \
1 change: 1 addition & 0 deletions examples/megatron/dense/qwen3_32b.sh
@@ -5,6 +5,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
megatron sft \
--load Qwen3-32B-mcore \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--tensor_model_parallel_size 8 \
--micro_batch_size 1 \
1 change: 1 addition & 0 deletions examples/megatron/fp8.sh
@@ -8,6 +8,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 \
megatron sft \
--load Qwen2.5-7B-mcore \
--dataset 'swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--tensor_model_parallel_size 2 \
--micro_batch_size 1 \
1 change: 1 addition & 0 deletions examples/megatron/long_text.sh
@@ -7,6 +7,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 \
megatron sft \
--load Qwen2.5-7B-mcore \
--dataset 'ZhipuAI/LongWriter-6k' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--tensor_model_parallel_size 4 \
--micro_batch_size 1 \
1 change: 1 addition & 0 deletions examples/megatron/lora/dpo.sh
@@ -6,6 +6,7 @@ megatron rlhf \
--rlhf_type dpo \
--load Qwen3-30B-A3B-Instruct-2507-mcore \
--dataset AI-ModelScope/orpo-dpo-mix-40k \
--load_from_cache_file true \
--train_type lora \
--lora_rank 8 \
--lora_alpha 32 \
1 change: 1 addition & 0 deletions examples/megatron/lora/glm4_5_106b.sh
@@ -6,6 +6,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 \
megatron sft \
--load GLM-4.5-Air-mcore \
--dataset 'swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT' \
--load_from_cache_file true \
--train_type lora \
--lora_rank 32 \
--lora_alpha 64 \
1 change: 1 addition & 0 deletions examples/megatron/lora/loss_scale.sh
@@ -6,6 +6,7 @@ megatron sft \
--load Qwen3-30B-A3B-Base-mcore \
--train_type lora \
--dataset AI-ModelScope/function-calling-chatml#10000 \
--load_from_cache_file true \
--loss_scale hermes \
--agent_template hermes \
--lora_rank 8 \
1 change: 1 addition & 0 deletions examples/megatron/lora/qwen3_235b.sh
@@ -6,6 +6,7 @@ megatron sft \
--load Qwen3-235B-A22B-Instruct-2507-mcore \
--dataset 'swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT#2000' \
'swift/self-cognition#1000' \
--load_from_cache_file true \
--train_type lora \
--lora_rank 8 \
--lora_alpha 32 \
1 change: 1 addition & 0 deletions examples/megatron/moe/deepseek_v3.sh
@@ -7,6 +7,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
megatron sft \
--load Moonlight-16B-A3B-Instruct-mcore \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--pipeline_model_parallel_size 2 \
--decoder_last_pipeline_num_layers 13 \
1 change: 1 addition & 0 deletions examples/megatron/moe/moe.sh
@@ -5,6 +5,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
megatron sft \
--load Qwen1.5-MoE-A2.7B-mcore \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--pipeline_model_parallel_size 2 \
--decoder_last_pipeline_num_layers 11 \
1 change: 1 addition & 0 deletions examples/megatron/moe/qwen3_moe.sh
@@ -9,6 +9,7 @@ NODE_RANK=$RANK \
megatron sft \
--load Qwen3-30B-A3B-Base-mcore \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--pipeline_model_parallel_size 2 \
--expert_model_parallel_size 8 \
1 change: 1 addition & 0 deletions examples/megatron/moe/qwen3_moe_offload.sh
@@ -6,6 +6,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 \
megatron sft \
--load Qwen3-30B-A3B-Base-mcore \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--expert_model_parallel_size 4 \
--moe_permute_fusion true \
1 change: 1 addition & 0 deletions examples/megatron/multi-node/node1.sh
@@ -11,6 +11,7 @@ NPROC_PER_NODE=4 \
megatron sft \
--load Qwen2.5-14B-mcore \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--tensor_model_parallel_size 4 \
--micro_batch_size 1 \
1 change: 1 addition & 0 deletions examples/megatron/multi-node/node2.sh
@@ -8,6 +8,7 @@ NPROC_PER_NODE=4 \
megatron sft \
--load Qwen2.5-14B-mcore \
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--tensor_model_parallel_size 4 \
--micro_batch_size 1 \
1 change: 1 addition & 0 deletions examples/megatron/multimodal/dense/dpo.sh
@@ -7,6 +7,7 @@ megatron rlhf \
--rlhf_type dpo \
--load Qwen2.5-VL-7B-Instruct-mcore \
--dataset 'swift/RLAIF-V-Dataset#20000' \
--load_from_cache_file true \
--train_type full \
--tensor_model_parallel_size 4 \
--sequence_parallel true \