Merged
12 changes: 7 additions & 5 deletions docs/source/guides/add_modality.rst
Original file line number Diff line number Diff line change
@@ -77,7 +77,7 @@ A modality loader should always inherit from :class:`~multimeditron.dataset.load

The :code:`load` function has the following signature:

- Input: A dictionary that contains a key :code:`"value"`, i.e. :code:`{"value" : <something>}`. This is the case for every modality. The actual format of the value field depends on the dataset format. See `TODO`
- Input: A dictionary that contains a key :code:`"value"`, i.e. :code:`{"value" : <something>}`. This is the case for every modality. The actual format of the value field depends on the dataset format. See :ref:`dataset-format-label`
- Output: the raw modality (here a :class:`PIL.Image.Image`).
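As a rough sketch of this contract (the class name :code:`MyImageLoader` is hypothetical, and the real base class to inherit from lives in :code:`multimeditron.dataset.loader`), a :code:`load` implementation for images might look like:

```python
import io

from PIL import Image


class MyImageLoader:
    """Hypothetical sketch; a real loader inherits from the
    multimeditron loader base class."""

    def load(self, sample: dict) -> Image.Image:
        # The "value" field may hold raw image bytes (Arrow format)
        # or, for other loaders, a filesystem path.
        value = sample["value"]
        if isinstance(value, bytes):
            return Image.open(io.BytesIO(value)).convert("RGB")
        return Image.open(value).convert("RGB")
```

The only requirement illustrated here is the input/output shape: a :code:`{"value": ...}` dictionary in, a :code:`PIL.Image.Image` out.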


@@ -98,7 +98,6 @@ The configuration file configures both the processor and the modality:
def __init__(
self,
hidden_size: int = 4096,
max_batch_size: int = 32,
clip_name: str = "openai/clip-vit-large-patch14",
projection_type: str = "mlp",
**kwargs
@@ -116,8 +115,7 @@ The configuration file configures both the processor and the modality:

Every configuration needs to inherit from :class:`~multimeditron.model.modalities.base.BaseModalityConfig` and call the :code:`__init__` function of :code:`BaseModalityConfig` with the arguments:

- :code:`max_batch_size`: the maximum amount of modalities that can be processed in a single batch by the `forward` function of the modality embedder
- :code:`modality_type`: which modality type does this processor/modality pair handle. This field should match the :code:`"type"` field in the dataset. See `TODO`
- :code:`modality_type`: which modality type does this processor/modality pair handle. This field should match the :code:`"type"` field in the dataset. See :ref:`dataset-format-label`
- :code:`hidden_size`: the output dimension of the modality embedder's projection (i.e. the size of an LLM token embedding)

This configuration can be arbitrarily expanded with any JSON-serializable attributes. See `Huggingface custom model`_
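For illustration, here is a minimal sketch of such a subclass. The :code:`SimpleModalityConfig` base below is only a stand-in for :code:`BaseModalityConfig` (whose real definition lives in :code:`multimeditron.model.modalities.base`), and :code:`MyImageConfig` is a hypothetical name:

```python
class SimpleModalityConfig:
    """Stand-in for BaseModalityConfig, shown only to illustrate
    the required constructor arguments."""

    def __init__(self, modality_type: str, hidden_size: int, **kwargs):
        self.modality_type = modality_type
        self.hidden_size = hidden_size
        # Extra keyword arguments become JSON-serializable attributes.
        for key, value in kwargs.items():
            setattr(self, key, value)


class MyImageConfig(SimpleModalityConfig):
    def __init__(self, hidden_size: int = 4096,
                 clip_name: str = "openai/clip-vit-large-patch14",
                 projection_type: str = "mlp", **kwargs):
        super().__init__(modality_type="image",
                         hidden_size=hidden_size, **kwargs)
        # Arbitrary JSON-serializable extensions of the configuration.
        self.clip_name = clip_name
        self.projection_type = projection_type
```

The subclass forwards :code:`modality_type` and :code:`hidden_size` to the base constructor and keeps its own extra fields alongside them.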
@@ -229,4 +227,8 @@ A modality class must implement 3 functions:

Those "freezing" functions are used to train different parts of the MultiMeditron architecture separately, which helps ensure training stability.
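The exact freezing API is defined by the modality base class; as a hedged sketch of what such functions typically do (assuming standard PyTorch modules), freezing amounts to toggling :code:`requires_grad` on a submodule's parameters:

```python
import torch.nn as nn


def freeze(module: nn.Module) -> None:
    # Frozen parameters receive no gradient updates during training.
    for p in module.parameters():
        p.requires_grad = False


def unfreeze(module: nn.Module) -> None:
    for p in module.parameters():
        p.requires_grad = True
```

In ALIGNMENT mode, for example, everything except the projection layer would be frozen this way.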

TODO: Redirect to creating dataset + launching training
Further reading:

- :any:`Creating a dataset with the right format <dataset-format-label>`
- :any:`Launching a training <training-label>`

15 changes: 13 additions & 2 deletions docs/source/guides/configuration.rst
@@ -7,20 +7,31 @@ Configuration Reference
.. code-block:: yaml

base_llm: # (str) Path to LLM model (can be a local model or a model stored on huggingface)
base_model: # (str) Path to trained model. If empty, the LLM model will be initialized to the weights of base_llm, the CLIP are initialized to their default values and projections are initialized randomly
base_model: # (str) Path to trained model. If empty, the LLM model will be initialized to the weights of base_llm, the modality embedders are initialized to their default values and projections are initialized randomly
attachment_token: # (str) Attachment placeholder in the prompts. Defaults to <|reserved_special_token_0|>
tokenizer_type: # (str) The type of tokenizer that should be used, depends on the model (supported values are llama and apertus)
tokenizer_type: # (str) The type of tokenizer that should be used, depends on the model (supported values are llama, apertus and qwen3)
token_size: # (int) Dimension of the embedding of a token for the LLM

# Truncation settings
truncation: # (Optional[bool]) Whether to truncate the input, defaults to false
max_sequence_length: # (Optional[int]) The maximum sequence length if truncation is enabled

# Reload from checkpoint
resume_from_checkpoint: # (Optional[bool]) Whether to resume training from a checkpoint, defaults to false. If set to true, training resumes from the checkpoint in base_model
wandb_run_id: # (Optional[str]) The wandb run id to resume from if resume_from_checkpoint is true


modalities:
config: # (Dict[str, str]) Configuration passed to the modality
model_type: # (str) Type of the modality used (e.g. meditron_clip or moe_meditron_clip for instance)
# The other parameters in config are passed in the modality configuration

training_mode: # (str) Either ALIGNMENT, END2END or FULL. ALIGNMENT trains the projection layer while freezing all other weights. END2END trains the LLM and the projection while freezing all other weights. FULL trains the entire model at once

loaders:
- loader_type: # (str) Type of the loader. Supported values are: raw-image (for image bytes/PIL images), fs-image (for image paths on the filesystem, not recommended)
modality_type: # (str) Type of the modality that this loader corresponds to (e.g. image)

datasets: # List of datasets to use for finetuning. Each dataset must follow the format described in the README.md
- packed_path: # (str) Path to the 1st dataset
- packed_path: # (str) Path to the 2nd dataset
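For orientation, here is a minimal example configuration assembled from the fields above. The model name, the paths, and the exact nesting under :code:`modalities` are assumptions made for illustration; check them against a real configuration file:

```yaml
base_llm: "meta-llama/Meta-Llama-3.1-8B-Instruct"  # assumed model choice
tokenizer_type: "llama"
token_size: 4096
training_mode: "ALIGNMENT"

modalities:
  - config:
      model_type: "meditron_clip"

loaders:
  - loader_type: "raw-image"
    modality_type: "image"

datasets:
  - packed_path: "/path/to/first/dataset"
```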
23 changes: 19 additions & 4 deletions docs/source/guides/dataset_format.rst
@@ -1,13 +1,28 @@
.. _dataset-format-label:

Dataset format
==============

.. _dataset-format-modalities:
.. toctree::
:maxdepth: 2
:caption: Supported modalities
:includehidden:

modalities/image


This section describes the dataset format supported by the MultiMeditron pipeline. The dataset format varies from one modality to another and you can add your own modality by following :any:`this tutorial <add-modality-label>`.

Our training pipeline supports two types of datasets: pretraining and instruction-tuning. We support two storage formats:

1. Arrow/Parquet format (recommended): where the modalities are directly stored in the dataset
2. JSONL format (not recommended): where the images and modalities are stored on the filesystem. Those datasets must be processed with :code:`merge_inputs.py`

We support the following modalities; for a detailed format description, please refer to the :any:`corresponding documentation <dataset-format-modalities>`.



Arrow format (recommended)
--------------------------

@@ -20,13 +35,13 @@ Each dataset must contain a column :code:`text` and a column :code:`modalities`.

"Let's compare the first image: <|reserved_special_token_0|>, and the second 3D image: <|reserved_special_token_0|>"

And the code:`modalities` column must be of the following form:
And the :code:`modalities` column must be of the following form:

.. code-block:: python

[{"type": "modality_type", "value" : some_modality}]

For instance, for image type, :code:`some_modality` must contains the bytes of the image
For instance, for image type, :code:`some_modality` must contain a PIL Image object.

Note that we use a special placeholder, :code:`<|reserved_special_token_0|>`, to indicate the position of the tokens coming from the modality.
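As a small sketch (the helper name is hypothetical, and :code:`image` stands for whatever the modality type expects, e.g. a PIL image for the image type), a pretraining row can be built like this:

```python
ATTACHMENT_TOKEN = "<|reserved_special_token_0|>"


def make_pretraining_row(image, caption: str) -> dict:
    # Hypothetical helper: one Arrow-format pretraining row.
    # Each placeholder in `text` must match one entry in `modalities`,
    # in order of appearance.
    return {
        "text": f"Describe the image: {ATTACHMENT_TOKEN}. {caption}",
        "modalities": [{"type": "image", "value": image}],
    }
```

The invariant to keep in mind is that the number of placeholders in :code:`text` equals the length of the :code:`modalities` list.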

@@ -45,8 +60,8 @@ It's the same as the pretraining dataset but instead of the :code:`text` column,
{"role" : "assistant", "content" : "Sed non risus. Suspendisse lectus tortor, dignissim sit amet, adipiscing nec, ultricies sed, dolor."}
]

JSONL format
------------
JSONL format (deprecated)
-------------------------

We also support :code:`.jsonl` files where each line corresponds to a sample. We describe how each sample must be formatted:

1 change: 1 addition & 0 deletions docs/source/guides/guide.rst
@@ -6,6 +6,7 @@ This section contains user guides and tutorials for the MultiMeditron project.
.. toctree::
:maxdepth: 2
:includehidden:
:caption: User Guides

quickstart
add_modality
84 changes: 84 additions & 0 deletions docs/source/guides/modalities/image.rst
@@ -0,0 +1,84 @@
.. _dataset-format-image:

Image modality
==============

Read :ref:`dataset-format-label` for more details about the general dataset format.

This section describes how to format image modalities in the dataset.

PIL Image format
----------------

The Hugging Face :code:`datasets` library automatically converts PIL images into bytes when saving to the Arrow format, so you can use PIL images directly when creating your dataset. We provide an example below:

.. code-block:: python

    from PIL import Image
    import datasets

    def generate_sample(image_path):
        image = Image.open(image_path).convert("RGB")
        conversations = [
            {"role": "user", "content": "Describe the image: <|reserved_special_token_0|>."},
            {"role": "assistant", "content": "This is an image of ..."}
        ]
        return {
            "conversations": conversations,
            "modalities": [{"type": "image", "value": image}]
        }

    dataset = datasets.Dataset.from_generator(
        lambda: (generate_sample(path) for path in list_of_image_paths)
    )
    dataset.save_to_disk("path/to/save/dataset")

When the dataset is saved to the Arrow format, the images are converted to bytes automatically.

When training the model, add the following to the configuration file:

.. code-block:: yaml

    loaders:
      - loader_type: "raw-image"
        modality_type: "image"

    datasets:
      - packed_path: "path/to/save/dataset"

Filesystem format
-----------------

You can also store the images on the filesystem and put their paths in the dataset. However, this format is not recommended for training because it is less efficient than storing the images directly in the dataset; we recommend the PIL/bytes format for training and the filesystem format only for inference.

In the dataset, the modality value must be the path to the image:

.. code-block:: python

    import datasets

    def generate_sample(image_path):
        conversations = [
            {"role": "user", "content": "Describe the image: <|reserved_special_token_0|>."},
            {"role": "assistant", "content": "This is an image of ..."}
        ]
        return {
            "conversations": conversations,
            "modalities": [{"type": "image", "value": image_path}]
        }

    dataset = datasets.Dataset.from_generator(
        lambda: (generate_sample(path) for path in list_of_image_paths)
    )
    dataset.save_to_disk("path/to/save/dataset")

When training the model, add the following to the configuration file:

.. code-block:: yaml

    loaders:
      - loader_type: "fs-image"
        modality_type: "image"

    datasets:
      - packed_path: "path/to/save/dataset"


19 changes: 7 additions & 12 deletions docs/source/guides/quickstart.rst
@@ -88,11 +88,7 @@ Once you have installed MultiMeditron, you can run inference on your images. Her
ATTACHMENT_TOKEN = "<|reserved_special_token_0|>"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", dtype=torch.bfloat16)
tokenizer.pad_token = tokenizer.eos_token
special_tokens = {'additional_special_tokens': [ATTACHMENT_TOKEN]}
tokenizer.add_special_tokens(special_tokens)
attachment_token_idx = tokenizer.convert_tokens_to_ids(ATTACHMENT_TOKEN)
tokenizer = AutoTokenizer.from_pretrained("/path/to/trained/model", dtype=torch.bfloat16)

model = MultiModalModelForCausalLM.from_pretrained("path/to/trained/model", device_map="auto")
model.eval()
@@ -108,14 +104,13 @@ Once you have installed MultiMeditron, you can run inference on your images. Her
}

loader = FileSystemImageLoader(base_path=os.getcwd())

collator = DataCollatorForMultimodal(
tokenizer=tokenizer,
tokenizer_type="llama",
modality_processors=model.processors(),
modality_loaders={"image" : loader},
attachment_token_idx=attachment_token_idx,
add_generation_prompt=True
tokenizer=tokenizer,
attachment_token=ATTACHMENT_TOKEN,
chat_template=ChatTemplate.from_name("llama"),
modality_processors=model.processors(),
modality_loaders={"image" : loader},
add_generation_prompt=True,
)

batch = collator([sample])
10 changes: 6 additions & 4 deletions docs/source/guides/training.rst
@@ -1,6 +1,9 @@
.. _training-label:

Training a MultiMeditron model
==============================


This tutorial provides a step-by-step guide on how to train a model using MultiMeditron. We will walk you through the process with clear examples.

Configuration files
@@ -131,7 +134,7 @@ We provide scripts to launch MultiMeditron training on multi node cluster. We pr
* TODO: Provide script for Run:ai cluster

SLURM cluster
""""""""""""""
"""""""""""""

To launch a training on a SLURM cluster, we can use the following :code:`sbatch` script:

@@ -146,11 +149,9 @@ To launch a training on a SLURM cluster, we can use the following :code:`sbatch`
#SBATCH --gres gpu:4 # Number of GPUs
#SBATCH --cpus-per-task 288 # number of CPUs per task.
#SBATCH --time 11:59:59 # maximum execution time (DD-HH:MM:SS)
#SBATCH --environment ~/.edf/multimodal.toml
#SBATCH --export=ALL,SCRATCH=/iopsstor/scratch/cscs/$USER
#SBATCH -A a127

export CUDA_LAUNCH_BLOCKING=1
echo "START TIME: $(date)"
# auto-fail on any errors in this script
set -eo pipefail
@@ -192,8 +193,9 @@ To launch a training on a SLURM cluster, we can use the following :code:`sbatch`
--cpus-per-task $SLURM_CPUS_PER_TASK \
--jobid $SLURM_JOB_ID \
--wait 60 \
-A a06 \
-A a127 \
--reservation=sai-a127
--environment ~/.edf/multimodal.toml
"

# bash -c is needed for the delayed interpolation of env vars to work
1 change: 1 addition & 0 deletions src/multimeditron/model/attention.py
@@ -98,4 +98,5 @@ def forward(
out = self.proj(out)
out = self.proj_drop(out)


return out
1 change: 1 addition & 0 deletions src/multimeditron/model/model.py
@@ -655,6 +655,7 @@ def bootstrap(config, tokenizer, modalities_config):
Returns:
MultiModalModelForCausalLM: The initialized multimodal model.
"""


multimodal_config = MultimodalConfig(
hidden_size=config["token_size"],
Expand Down