Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions src/camera-based-e2e/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@ def __init__(
self.file = None

with open(indexFile, 'rb') as f:
# NOTE: test does not have reference trajectories
# We train on train and validate on val set
self.indexes = pickle.load(f)

# TODO: Determine how to sample specific subsets of the data that we care about.
Expand All @@ -55,10 +53,10 @@ def decode_img(self, img):
gpu_tensors_list = torchvision.io.decode_jpeg(
img_tensor,
mode=torchvision.io.ImageReadMode.UNCHANGED,
device= 'cpu' #['cuda:0', 'cuda:1'][torch.utils.data.get_worker_info().id%2]
device= 'cuda' #['cuda:0', 'cuda:1'][torch.utils.data.get_worker_info().id%2]
)
# img_array = np.frombuffer(img, np.uint8)
return gpu_tensors_list
return gpu_tensors_list.cpu()

def __len__(self):
return len(self.indexes)
Expand Down Expand Up @@ -103,8 +101,9 @@ def __iter__(self):
from torch.utils.data import DataLoader
import time
from tqdm import tqdm
# NOTE: Replace with your path
DATA_DIR = '/scratch/gilbreth/mgagvani/wod/waymo_open_dataset_end_to_end_camera_v_1_0_0/'
DATA_DIR = '/scratch/gilbreth/shar1159/waymo_open_dataset_end_to_end_camera_v_1_0_0/'
# DATA_DIR = './data'
# DATA_DIR = '/tmp/'
BATCH_SIZE = 32
dataset = WaymoE2E(indexFile="index_train.pkl", data_dir = DATA_DIR, images=True)
loader = DataLoader(
Expand Down
Binary file added src/camera-based-e2e/test_index.pkl
Binary file not shown.
29 changes: 11 additions & 18 deletions src/camera-based-e2e/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,6 @@
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

from matplotlib import pyplot as plt
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path

from loader import WaymoE2E

# Replace with your model defined in models/
from models.base_model import LitModel, collate_with_images
from models.monocular import MonocularModel, DeepMonocularModel, SAMFeatures
Expand All @@ -31,8 +21,17 @@
train_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_train.pkl', data_dir=args.data_dir, images=True, n_items=250000)
test_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
Comment on lines 21 to 22

Copilot AI Jan 29, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This assignment to 'train_dataset' is unnecessary as it is redefined before this value is used.

Suggested change
train_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_train.pkl', data_dir=args.data_dir, images=True, n_items=250000)
test_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
WaymoE2E(batch_size=args.batch_size, indexFile='index_train.pkl', data_dir=args.data_dir, images=True, n_items=250000)
WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)

Copilot uses AI. Check for mistakes.

Copilot AI Jan 29, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This assignment to 'test_dataset' is unnecessary as it is redefined before this value is used.

Suggested change
test_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)
WaymoE2E(batch_size=args.batch_size, indexFile='index_val.pkl', data_dir=args.data_dir, images=True, n_items=50000)

Copilot uses AI. Check for mistakes.

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, num_workers=12, collate_fn=collate_with_images, persistent_workers=False, pin_memory=False)
val_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, num_workers=12, collate_fn=collate_with_images, persistent_workers=False, pin_memory=False)
with open("train_index.pkl", "wb") as f1:
pickle.dump(train_index, f1)

with open("test_index.pkl", "wb") as f2:
pickle.dump(test_index, f2)

train_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='train_index.pkl', data_dir=args.data_dir, images=False)
test_dataset = WaymoE2E(batch_size=args.batch_size, indexFile='test_index.pkl', data_dir=args.data_dir, images=False)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, num_workers=16)
val_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, num_workers=16)

# Model
in_dim = 16 * 6 # Past: (B, 16, 6)
Expand All @@ -59,7 +58,6 @@

trainer.fit(lit_model, train_loader, val_loader)

# Export loss graph to visualizations/
try:
base_path = Path(base_path)
run_dir = sorted((base_path / "logs").glob("camera_e2e_*"))[-1] # newest run
Expand All @@ -79,8 +77,3 @@
except Exception as e:
print(f"Could not save loss plot: {e}")






Binary file added src/camera-based-e2e/train_index.pkl
Binary file not shown.
26 changes: 26 additions & 0 deletions src/camera-based-e2e/train_job.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# Slurm batch job: runs train.py for the camera-based E2E model on one GPU.
#
# Submit with:  sbatch train_job.sbatch
#
# The conda environment, project directory and dataset directory default to
# the original hard-coded locations and can be overridden at submit time via
# WAYMO_CONDA_ENV, WAYMO_PROJECT_DIR and WAYMO_DATA_DIR.
#
# Every setup step is checked: if the module, the environment or the project
# directory is unavailable, the job aborts with a message instead of
# continuing into training with an unverified environment.
#SBATCH --job-name=waymo-e2e
#SBATCH --output=waymo-e2e-%j.out
#SBATCH --error=waymo-e2e-%j.err

#SBATCH --account=csso
#SBATCH --partition=v100
#SBATCH --gres=gpu:1
#SBATCH --time=12:00:00 # 12 hours
#SBATCH --mem=32G
# NOTE(review): train.py builds its DataLoaders with num_workers=16, and
# PyTorch reported a suggested maximum of 4 workers under this allocation.
# Confirm whether to raise --cpus-per-task or lower num_workers.
#SBATCH --cpus-per-task=4

# Print an error message to stderr and abort the job with a non-zero status.
die() {
    echo "ERROR: $*" >&2
    exit 1
}

# User-overridable locations; the defaults are the original hard-coded paths.
CONDA_ENV="${WAYMO_CONDA_ENV:-/scratch/gilbreth/shar1159/conda_envs/waymo}"
PROJECT_DIR="${WAYMO_PROJECT_DIR:-/scratch/gilbreth/shar1159/robotvision/src/camera-based-e2e}"
DATA_DIR="${WAYMO_DATA_DIR:-/scratch/gilbreth/shar1159/waymo_open_dataset_end_to_end_camera_v_1_0_0/}"

# >>> Activate conda environment <<<
# An unknown module name makes 'module load' fail; stop here rather than
# letting the following 'source activate' fail as a knock-on effect.
module load anaconda/2023.03 \
    || die "could not load module anaconda/2023.03 (try 'module spider anaconda')"
source activate "$CONDA_ENV" \
    || die "could not activate conda environment: $CONDA_ENV"

# >>> Navigate to your project directory <<<
# train.py is invoked by relative path, so a failed cd must not be ignored.
cd "$PROJECT_DIR" || die "project directory not found: $PROJECT_DIR"

# >>> Run training <<<
srun python train.py \
    --data_dir "$DATA_DIR" \
    --batch_size 16 \
    --lr 0.001 \
    --max_epochs 10

33 changes: 33 additions & 0 deletions src/camera-based-e2e/waymo-e2e-9940263.err
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Lmod has detected the following error: The following module(s) are unknown: "anaconda/2023.03"

Please check the spelling or version number. Also try "module spider ..."
It is also possible your cache file is out-of-date; it may help to try:
$ module --ignore_cache load "anaconda/2023.03"

Also make sure that all modulefiles written in TCL start with the string #%Module



/var/spool/slurm/job9940263/slurm_script: line 15: activate: No such file or directory
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /scratch/gilbreth/shar1159/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

| Name | Type | Params | Mode | In sizes | Out sizes
-------------------------------------------------------------------
0 | model | BaseModel | 332 K | train | [1, 96] | [1, 40]
-------------------------------------------------------------------
332 K Trainable params
0 Non-trainable params
332 K Total params
1.331 Total estimated model params size (MB)
7 Modules in train mode
0 Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:123: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
slurmstepd: error: *** STEP 9940263.0 ON gilbreth-e000 CANCELLED AT 2025-11-15T15:35:08 ***
slurmstepd: error: *** JOB 9940263 ON gilbreth-e000 CANCELLED AT 2025-11-15T15:35:08 ***
1 change: 1 addition & 0 deletions src/camera-based-e2e/waymo-e2e-9940263.out

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions src/camera-based-e2e/waymo-e2e-9940269.err
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Lmod has detected the following error: The following module(s) are unknown: "anaconda/2023.03"

Please check the spelling or version number. Also try "module spider ..."
It is also possible your cache file is out-of-date; it may help to try:
$ module --ignore_cache load "anaconda/2023.03"

Also make sure that all modulefiles written in TCL start with the string #%Module



/var/spool/slurm/job9940269/slurm_script: line 15: activate: No such file or directory
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /scratch/gilbreth/shar1159/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

| Name | Type | Params | Mode | In sizes | Out sizes
-------------------------------------------------------------------
0 | model | BaseModel | 332 K | train | [1, 96] | [1, 40]
-------------------------------------------------------------------
332 K Trainable params
0 Non-trainable params
332 K Total params
1.331 Total estimated model params size (MB)
7 Modules in train mode
0 Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:123: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
slurmstepd: error: *** STEP 9940269.0 ON gilbreth-e000 CANCELLED AT 2025-11-15T16:56:36 ***
slurmstepd: error: *** JOB 9940269 ON gilbreth-e000 CANCELLED AT 2025-11-15T16:56:36 ***
1 change: 1 addition & 0 deletions src/camera-based-e2e/waymo-e2e-9940269.out

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions src/camera-based-e2e/waymo-e2e-9940377.err
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Lmod has detected the following error: The following module(s) are unknown: "anaconda/2023.03"

Please check the spelling or version number. Also try "module spider ..."
It is also possible your cache file is out-of-date; it may help to try:
$ module --ignore_cache load "anaconda/2023.03"

Also make sure that all modulefiles written in TCL start with the string #%Module



/var/spool/slurm/job9940377/slurm_script: line 15: activate: No such file or directory
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/torch/utils/data/dataloader.py:627: UserWarning: This DataLoader will create 16 worker processes in total. Our suggested max number of worker in current system is 4, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
warnings.warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /scratch/gilbreth/shar1159/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

| Name | Type | Params | Mode | In sizes | Out sizes
-------------------------------------------------------------------
0 | model | BaseModel | 332 K | train | [1, 96] | [1, 40]
-------------------------------------------------------------------
332 K Trainable params
0 Non-trainable params
332 K Total params
1.331 Total estimated model params size (MB)
7 Modules in train mode
0 Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:123: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.
`Trainer.fit` stopped: `max_epochs=10` reached.
52,001 changes: 52,001 additions & 0 deletions src/camera-based-e2e/waymo-e2e-9940377.out

Large diffs are not rendered by default.

Loading