mgagvani · shreyshrma1 · Nov 25, 2025 · Nov 25, 2025 · Jan 28, 2026 · Copilot
diff --git a/src/camera-based-e2e/loader.py b/src/camera-based-e2e/loader.py
@@ -33,8 +33,6 @@ def __init__(
         self.file = None
 
         with open(indexFile, 'rb') as f:
-            # NOTE: test does not have reference trajectories
-            # We train on train and validate on val set
             self.indexes = pickle.load(f)
 
         # TODO: Determine how to sample specific subsets of the data that we care about.
@@ -55,10 +53,10 @@ def decode_img(self, img):
         gpu_tensors_list = torchvision.io.decode_jpeg(
             img_tensor, 
             mode=torchvision.io.ImageReadMode.UNCHANGED,
-            device= 'cpu' #['cuda:0', 'cuda:1'][torch.utils.data.get_worker_info().id%2]
+            device= 'cuda' #['cuda:0', 'cuda:1'][torch.utils.data.get_worker_info().id%2]
         )
         # img_array = np.frombuffer(img, np.uint8)
-        return gpu_tensors_list
+        return gpu_tensors_list.cpu()
 
     def __len__(self):
         return len(self.indexes)
@@ -103,8 +101,9 @@ def __iter__(self):
     from torch.utils.data import DataLoader
     import time
     from tqdm import tqdm
-    # NOTE: Replace with your path
-    DATA_DIR = '/scratch/gilbreth/mgagvani/wod/waymo_open_dataset_end_to_end_camera_v_1_0_0/'
+    DATA_DIR = '/scratch/gilbreth/shar1159/waymo_open_dataset_end_to_end_camera_v_1_0_0/'
+    # DATA_DIR = './data'
+    # DATA_DIR = '/tmp/'
     BATCH_SIZE = 32
     dataset = WaymoE2E(indexFile="index_train.pkl", data_dir = DATA_DIR, images=True)
     loader = DataLoader(

diff --git a/src/camera-based-e2e/test_index.pkl b/src/camera-based-e2e/test_index.pkl
diff --git a/src/camera-based-e2e/train.py b/src/camera-based-e2e/train.py
@@ -5,16 +5,6 @@
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import CSVLogger
 
-from matplotlib import pyplot as plt
-import pandas as pd
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from pathlib import Path
-
-from loader import WaymoE2E
-
 # Replace with your model defined in models/ 
 from models.base_model import LitModel, collate_with_images
 from models.monocular import MonocularModel, DeepMonocularModel, SAMFeatures
@@ -31,8 +21,17 @@
     train_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_train.pkl', data_dir=args.data_dir, images=True, n_items=250000)
     test_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
-    train_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_train.pkl', data_dir=args.data_dir, images=True, n_items=250000)
-    test_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
+    WaymoE2E(batch_size=args.batch_size, indexFile='index_train.pkl', data_dir=args.data_dir, images=True, n_items=250000)
+    WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
-    test_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
+    WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
-    train_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_train.pkl', data_dir=args.data_dir, images=True, n_items=250000)
-    test_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
+    WaymoE2E(batch_size=args.batch_size, indexFile='index_train.pkl', data_dir=args.data_dir, images=True, n_items=250000)
+    WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
-    test_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
+    WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
 
-    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, num_workers=12, collate_fn=collate_with_images, persistent_workers=False, pin_memory=False)
-    val_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, num_workers=12, collate_fn=collate_with_images, persistent_workers=False, pin_memory=False)
+    with open("train_index.pkl", "wb") as f1:
+        pickle.dump(train_index, f1)
+
+    with open("test_index.pkl", "wb") as f2:
+        pickle.dump(test_index, f2)
+
+    train_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='train_index.pkl', data_dir=args.data_dir, images=False)
+    test_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='test_index.pkl', data_dir=args.data_dir, images=False)
+
+    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, num_workers=16)
+    val_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, num_workers=16)
 
     # Model
     in_dim = 16 * 6  # Past: (B, 16, 6)
@@ -59,7 +58,6 @@
 
     trainer.fit(lit_model, train_loader, val_loader)
 
-    # Export loss graph to visualizations/
     try:
         base_path = Path(base_path)
         run_dir = sorted((base_path / "logs").glob("camera_e2e_*"))[-1]  # newest run
@@ -79,8 +77,3 @@
     except Exception as e:
         print(f"Could not save loss plot: {e}")
 
-
-
-
-
-
diff --git a/src/camera-based-e2e/train_index.pkl b/src/camera-based-e2e/train_index.pkl
diff --git a/src/camera-based-e2e/train_job.sbatch b/src/camera-based-e2e/train_job.sbatch
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Slurm batch job: trains the camera-based E2E model by running train.py on a
+# single GPU of the v100 partition. %j in the log names expands to the job id.
+#SBATCH --job-name=waymo-e2e
+#SBATCH --output=waymo-e2e-%j.out
+#SBATCH --error=waymo-e2e-%j.err
+
+#SBATCH --account=csso
+#SBATCH --partition=v100
+#SBATCH --gres=gpu:1
+#SBATCH --time=12:00:00           # 12 hours
+#SBATCH --mem=32G
+# NOTE(review): train.py creates its DataLoaders with num_workers=16, which is
+# more than the 4 CPUs requested here -- consider aligning the two.
+#SBATCH --cpus-per-task=4
+
+# >>> Activate conda environment <<<
+# NOTE(review): committed job logs show Lmod reporting this module as unknown
+# and `activate: No such file or directory`; confirm the name with
+# `module spider anaconda` before relying on this step.
+module load anaconda/2023.03
+source activate /scratch/gilbreth/shar1159/conda_envs/waymo
+
+# >>> Navigate to your project directory <<<
+# Abort if the directory is missing rather than launching train.py elsewhere.
+cd /scratch/gilbreth/shar1159/robotvision/src/camera-based-e2e || exit 1
+
+# >>> Run training <<<
+srun python train.py \
+    --data_dir /scratch/gilbreth/shar1159/waymo_open_dataset_end_to_end_camera_v_1_0_0/ \
+    --batch_size 16 \
+    --lr 0.001 \
+    --max_epochs 10
+
diff --git a/src/camera-based-e2e/waymo-e2e-9940263.err b/src/camera-based-e2e/waymo-e2e-9940263.err
@@ -0,0 +1,33 @@
+Lmod has detected the following error: The following module(s) are unknown: "anaconda/2023.03"
+
+Please check the spelling or version number. Also try "module spider ..."
+It is also possible your cache file is out-of-date; it may help to try:
+  $ module --ignore_cache load "anaconda/2023.03"
+
+Also make sure that all modulefiles written in TCL start with the string #%Module
+
+
+
+/var/spool/slurm/job9940263/slurm_script: line 15: activate: No such file or directory
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /scratch/gilbreth/shar1159/checkpoints exists and is not empty.
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
+
+  | Name  | Type      | Params | Mode  | In sizes | Out sizes
+-------------------------------------------------------------------
+0 | model | BaseModel | 332 K  | train | [1, 96]  | [1, 40]  
+-------------------------------------------------------------------
+332 K     Trainable params
+0         Non-trainable params
+332 K     Total params
+1.331     Total estimated model params size (MB)
+7         Modules in train mode
+0         Modules in eval mode
+SLURM auto-requeueing enabled. Setting signal handlers.
+/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
+/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:123: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.
+/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
+srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
+slurmstepd: error: *** STEP 9940263.0 ON gilbreth-e000 CANCELLED AT 2025-11-15T15:35:08 ***
+slurmstepd: error: *** JOB 9940263 ON gilbreth-e000 CANCELLED AT 2025-11-15T15:35:08 ***
diff --git a/src/camera-based-e2e/waymo-e2e-9940263.out b/src/camera-based-e2e/waymo-e2e-9940263.out
diff --git a/src/camera-based-e2e/waymo-e2e-9940269.err b/src/camera-based-e2e/waymo-e2e-9940269.err
@@ -0,0 +1,33 @@
+Lmod has detected the following error: The following module(s) are unknown: "anaconda/2023.03"
+
+Please check the spelling or version number. Also try "module spider ..."
+It is also possible your cache file is out-of-date; it may help to try:
+  $ module --ignore_cache load "anaconda/2023.03"
+
+Also make sure that all modulefiles written in TCL start with the string #%Module
+
+
+
+/var/spool/slurm/job9940269/slurm_script: line 15: activate: No such file or directory
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /scratch/gilbreth/shar1159/checkpoints exists and is not empty.
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
+
+  | Name  | Type      | Params | Mode  | In sizes | Out sizes
+-------------------------------------------------------------------
+0 | model | BaseModel | 332 K  | train | [1, 96]  | [1, 40]  
+-------------------------------------------------------------------
+332 K     Trainable params
+0         Non-trainable params
+332 K     Total params
+1.331     Total estimated model params size (MB)
+7         Modules in train mode
+0         Modules in eval mode
+SLURM auto-requeueing enabled. Setting signal handlers.
+/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
+/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:123: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.
+/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
+srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
+slurmstepd: error: *** STEP 9940269.0 ON gilbreth-e000 CANCELLED AT 2025-11-15T16:56:36 ***
+slurmstepd: error: *** JOB 9940269 ON gilbreth-e000 CANCELLED AT 2025-11-15T16:56:36 ***
diff --git a/src/camera-based-e2e/waymo-e2e-9940269.out b/src/camera-based-e2e/waymo-e2e-9940269.out
diff --git a/src/camera-based-e2e/waymo-e2e-9940377.err b/src/camera-based-e2e/waymo-e2e-9940377.err
@@ -0,0 +1,31 @@
+Lmod has detected the following error: The following module(s) are unknown: "anaconda/2023.03"
+
+Please check the spelling or version number. Also try "module spider ..."
+It is also possible your cache file is out-of-date; it may help to try:
+  $ module --ignore_cache load "anaconda/2023.03"
+
+Also make sure that all modulefiles written in TCL start with the string #%Module
+
+
+
+/var/spool/slurm/job9940377/slurm_script: line 15: activate: No such file or directory
+/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/torch/utils/data/dataloader.py:627: UserWarning: This DataLoader will create 16 worker processes in total. Our suggested max number of worker in current system is 4, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
+  warnings.warn(
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /scratch/gilbreth/shar1159/checkpoints exists and is not empty.
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
+
+  | Name  | Type      | Params | Mode  | In sizes | Out sizes
+-------------------------------------------------------------------
+0 | model | BaseModel | 332 K  | train | [1, 96]  | [1, 40]  
+-------------------------------------------------------------------
+332 K     Trainable params
+0         Non-trainable params
+332 K     Total params
+1.331     Total estimated model params size (MB)
+7         Modules in train mode
+0         Modules in eval mode
+SLURM auto-requeueing enabled. Setting signal handlers.
+/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:123: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.
+`Trainer.fit` stopped: `max_epochs=10` reached.
diff --git a/src/camera-based-e2e/waymo-e2e-9940377.out b/src/camera-based-e2e/waymo-e2e-9940377.out