Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
76 changes: 76 additions & 0 deletions src/camera-based-e2e/build_hard_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import argparse
import json
import pickle
import os

from protos import e2e_pb2 # same import style you use in loader.py


def main():
    """Build a "hard example" index containing the top-K highest-loss frames.

    Steps:
      1. Load a ``{name: loss}`` JSON map and keep the ``--top_k`` names with
         the largest loss.
      2. Load the pickled input index, a list of
         ``(filename, start_byte, byte_length)`` tuples.
      3. Read every indexed record from ``--data_dir``, parse it as an
         ``E2EDFrame`` and look up ``frame.context.name``; remember the tuple
         when that name was selected in step 1.
      4. Pickle the matching tuples to ``--index_out``, ordered from highest
         to lowest loss, and print a short found/missing summary.

    Command-line arguments are read from ``sys.argv`` via ``argparse``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--index_in", type=str, default="index_train.pkl")
    parser.add_argument("--loss_json", type=str, default="scene_loss.json")
    parser.add_argument("--top_k", type=int, default=25000)
    parser.add_argument("--index_out", type=str, default="index_train_hard_25k.pkl")
    args = parser.parse_args()

    # A negative K would make the slice below drop entries from the *end* of
    # the ranking instead of limiting its length, so reject it up front.
    if args.top_k < 0:
        parser.error("--top_k must be non-negative")

    # 1) Load loss map and pick top-K names by loss (descending)
    with open(args.loss_json, "r", encoding="utf-8") as f:
        loss_map = json.load(f)  # {name: loss}

    top = sorted(loss_map.items(), key=lambda kv: kv[1], reverse=True)[: args.top_k]
    top_names_ordered = [name for name, _ in top]
    top_set = set(top_names_ordered)  # O(1) membership tests during the scan

    # 2) Load original index tuples
    # NOTE: pickle can execute arbitrary code on load; only use index files
    # produced by this project.
    with open(args.index_in, "rb") as f:
        idx_list = pickle.load(f)

    # 3) Scan index tuples, parse name, keep if in top_set.
    # Found tuples go in a dict so the output can follow the loss-sorted order.
    # If a name occurs more than once in the index, the last tuple seen wins.
    found = {}

    cur_file = None
    fh = None
    try:
        for filename, start_byte, byte_length in idx_list:
            # Keep one handle open and only reopen when the file changes.
            if filename != cur_file:
                if fh is not None:
                    fh.close()
                    fh = None
                fh = open(os.path.join(args.data_dir, filename), "rb")
                cur_file = filename

            fh.seek(start_byte)
            blob = fh.read(byte_length)

            frame = e2e_pb2.E2EDFrame()
            frame.ParseFromString(blob)

            name = frame.frame.context.name
            if name in top_set:
                found[name] = (filename, start_byte, byte_length)
    finally:
        # Release the data file even when a read or parse error aborts the scan.
        if fh is not None:
            fh.close()

    # 4) Emit output list in the SAME order as top_names_ordered (highest loss first)
    out_list = [found[n] for n in top_names_ordered if n in found]

    # Sanity check
    missing = [n for n in top_names_ordered if n not in found]
    print(f"Requested top_k={args.top_k}")
    print(f"Found tuples: {len(out_list)}")
    print(f"Missing names: {len(missing)}")
    if missing[:5]:
        print("Example missing:", missing[:5])

    with open(args.index_out, "wb") as f:
        pickle.dump(out_list, f)

    print(f"Wrote hard index: {args.index_out}")


if __name__ == "__main__":
    main()
24 changes: 24 additions & 0 deletions src/camera-based-e2e/frame_ranking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import json
import torch
import torch.nn as nn
# The submodule is lowercase ``functional``; ``torch.nn.Functional`` does not
# exist and makes the whole module fail at import time.
import torch.nn.functional as F

# Loss map read once at import time, relative to the current working
# directory (presumably {frame name: loss} -- confirm against the producer).
# The ``with`` block closes the file as soon as it has been parsed.
with open("scene_loss.json") as _loss_file:
    losses = json.load(_loss_file)

def generate_weights(loss_dict, epsilon=0.001, normalize=True):
    """Turn a ``{frame_id: loss}`` map into rank-based sampling weights.

    Frames are ranked by descending loss, with ties broken by ascending
    frame id. With ``n`` frames, the frame at 1-based rank ``r`` receives the
    raw weight ``(n - r + 1) + epsilon``, so the highest-loss frame gets the
    largest weight.

    Args:
        loss_dict: Mapping of frame id to loss value.
        epsilon: Constant added to every raw rank weight.
        normalize: When true, divide every weight by the sum of all weights.

    Returns:
        A dict of frame id to float weight, in ranking order.
    """
    ranked_ids = sorted(loss_dict, key=lambda fid: (-loss_dict[fid], fid))
    total = len(ranked_ids)

    # Position 0 is the highest loss and therefore gets the largest weight.
    weights = {
        fid: float((total - pos) + epsilon)
        for pos, fid in enumerate(ranked_ids)
    }
    if not normalize:
        return weights

    denom = sum(weights.values())
    return {fid: raw / denom for fid, raw in weights.items()}

def sample_frame_ids(weight_by_frame, k=1):
    """Draw ``k`` frame ids at random, with replacement, proportional to weight.

    Args:
        weight_by_frame: Mapping of frame id to its sampling weight.
        k: Number of ids to draw.

    Returns:
        A list of ``k`` frame ids. Ids may repeat because ``random.choices``
        samples with replacement.

    Raises:
        ValueError: Propagated from ``random.choices`` when the weights do not
            sum to a positive total (Python 3.9+).
        IndexError: Propagated from ``random.choices`` when the mapping is
            empty.
    """
    # Imported here because this module never imports ``random`` at the top;
    # without it the call below fails with NameError.
    import random

    frame_ids = list(weight_by_frame)
    weights = list(weight_by_frame.values())
    return random.choices(frame_ids, weights=weights, k=k)
24 changes: 24 additions & 0 deletions src/camera-based-e2e/hard_scene_job.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Slurm batch job: runs build_hard_index.py to write index_train_hard_25k.pkl
# from index_train.pkl and scene_loss.json.
#SBATCH --job-name=waymo-hard-index
# %j in the two file names below is replaced by the Slurm job id.
#SBATCH --output=waymo-hard-index-%j.out
#SBATCH --error=waymo-hard-index-%j.err

#SBATCH --account=csso
#SBATCH --partition=v100
#SBATCH --gres=gpu:1
# Wall-clock limit, hours:minutes:seconds.
#SBATCH --time=012:00:00
#SBATCH --mem=16G
#SBATCH --cpus-per-task=4

module load anaconda
source activate /scratch/gilbreth/shar1159/conda_envs/waymo

# The index and loss files below are relative paths, so stop immediately if
# the working directory cannot be entered instead of running somewhere else.
cd /scratch/gilbreth/shar1159/robotvision/src/camera-based-e2e || exit 1

srun python build_hard_index.py \
--data_dir /scratch/gilbreth/shar1159/waymo_open_dataset_end_to_end_camera_v_1_0_0/ \
--index_in index_train.pkl \
--loss_json scene_loss.json \
--top_k 25000 \
--index_out index_train_hard_25k.pkl

Binary file added src/camera-based-e2e/index_train_hard_25k.pkl
Binary file not shown.
68 changes: 68 additions & 0 deletions src/camera-based-e2e/latent_img_viz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import numpy as np
import os

from loader import WaymoE2E
from models.transfuser.team_code_transfuser import latentTF, latentTF_p2


def load_data(num_samples: int, data_root: str):
    """Return a batch-size-1 ``DataLoader`` over a ``WaymoE2E`` image dataset.

    Args:
        num_samples: Forwarded to ``WaymoE2E`` as ``n_items``.
        data_root: Forwarded to ``WaymoE2E`` as ``data_dir``.

    Returns:
        A ``DataLoader`` with ``batch_size=1`` and four worker processes. The
        dataset always reads the ``index_train.pkl`` index file.
    """
    dataset_kwargs = dict(
        batch_size=1,
        indexFile="index_train.pkl",
        data_dir=data_root,
        images=True,
        n_items=num_samples,
    )
    waymo_dataset = WaymoE2E(**dataset_kwargs)
    return DataLoader(waymo_dataset, batch_size=1, num_workers=4)

def visualize_waymo_batch(loader, output_dir):
    """Save camera and ``res_transform`` visualizations for batches of ``loader``.

    The first batch is skipped. For every later batch three PNG files are
    written into ``output_dir`` (each batch overwrites the previous files):

    * ``waymo_images.png``  -- front-left, front and front-right cameras plus
      the three images concatenated side by side.
    * ``latent_images.png`` -- outputs of ``latentTF.res_transform`` and
      ``latentTF_p2.res_transform`` for the same three cameras.
    * ``concat_images.png`` -- the side-by-side concatenation on its own.

    Args:
        loader: Iterable of batches; ``batch["IMAGES"][1..3]`` must hold the
            front-left, front and front-right images as ``[1, C, H, W]``
            tensors.
        output_dir: Directory that receives the PNG files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    def _to_hwc(img):
        # [1, C, H, W] tensor -> [H, W, C] numpy array, the layout imshow expects.
        return img.squeeze(0).permute(1, 2, 0).detach().cpu().numpy()

    for i, batch in enumerate(loader):
        if i == 0:
            continue

        # [B, C, H, W]
        fl = batch["IMAGES"][1]  # Front Left
        frnt = batch["IMAGES"][2]  # Front
        fr = batch["IMAGES"][3]  # Front Right

        print(f"Front Left shape: {fl.shape}")
        print(f"Front shape: {frnt.shape}")
        print(f"Front Right shape: {fr.shape}")

        fl_img = _to_hwc(fl)
        frnt_img = _to_hwc(frnt)
        fr_img = _to_hwc(fr)
        # Join along the width axis. numpy's keyword is ``axis``; ``dim`` is
        # the torch spelling and raises TypeError here.
        panorama = np.concatenate([fl_img, frnt_img, fr_img], axis=1)

        fig1, axes1 = plt.subplots(4, 1, figsize=(15, 5))
        axes1[0].imshow(fl_img)
        axes1[0].set_title("Front Left")
        axes1[1].imshow(frnt_img)
        axes1[1].set_title("Front")
        axes1[2].imshow(fr_img)
        axes1[2].set_title("Front Right")
        axes1[3].imshow(panorama)
        axes1[3].set_title("directly concatenated cameras")
        for ax in axes1:
            ax.axis('off')
        # Save into the directory created above, not a cwd-relative folder.
        fig1.savefig(os.path.join(output_dir, "waymo_images.png"))
        plt.close(fig1)  # figures otherwise accumulate across iterations

        # res_transform() expects (front, front_left, front_right)
        ltf1_out = latentTF.res_transform(frnt, fl, fr)
        ltf2_out = latentTF_p2.res_transform(frnt, fl, fr)

        fig2, axes2 = plt.subplots(2, 1, figsize=(15, 8))
        axes2[0].imshow(_to_hwc(ltf1_out))
        axes2[0].set_title("latentTF res_transform()")
        axes2[1].imshow(_to_hwc(ltf2_out))
        axes2[1].set_title("latentTF_p2 res_transform()")
        for ax in axes2:
            ax.axis('off')
        fig2.savefig(os.path.join(output_dir, "latent_images.png"))
        plt.close(fig2)

        # subplots(1, 1) returns a single Axes object, not an array, so it
        # must not be indexed. The numpy strip built above is reused because
        # imshow needs an [H, W, C] array rather than a [B, C, H, W] tensor.
        fig3, ax3 = plt.subplots(1, 1, figsize=(15, 8))
        ax3.imshow(panorama)
        ax3.set_title("directly concatenated cameras")
        fig3.savefig(os.path.join(output_dir, "concat_images.png"))
        plt.close(fig3)


# Script driver: load two items from the Waymo end-to-end camera dataset and
# write the visualizations for them.
# NOTE(review): this runs at import time; consider an
# ``if __name__ == "__main__":`` guard if the module is ever imported.
_DATA_ROOT = "/scratch/gilbreth/shar1159/waymo_open_dataset_end_to_end_camera_v_1_0_0"
_VIZ_DIR = "/scratch/gilbreth/shar1159/robotvision/src/camera-based-e2e/viz_plots"

LOADED_DATA = load_data(2, _DATA_ROOT)
visualize_waymo_batch(LOADED_DATA, _VIZ_DIR)
56 changes: 56 additions & 0 deletions src/camera-based-e2e/latent_p3/waymo-e2e-latent-10284875.err
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
You are using a CUDA device ('NVIDIA A10') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:881: Checkpoint directory /scratch/gilbreth/shar1159/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision bf16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.

| Name | Type | Params | Mode | FLOPs
--------------------------------------------------------------
0 | backbone | latentTFBackbone | 66.2 M | train | 0
1 | head | Sequential | 169 K | train | 0
--------------------------------------------------------------
66.4 M Trainable params
0 Non-trainable params
66.4 M Total params
265.467 Total estimated model params size (MB)
779 Modules in train mode
0 Modules in eval mode
0 Total Flops
SLURM auto-requeueing enabled. Setting signal handlers.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:123: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 16. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 8. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
`Trainer.fit` stopped: `max_epochs=10` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Traceback (most recent call last):
File "/scratch/gilbreth/shar1159/robotvision/src/camera-based-e2e/train_latent.py", line 141, in <module>
trainer.test(lit_model, dataloaders=test_loader)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 821, in test
return call._call_and_handle_interrupt(
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 49, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 864, in _test_impl
results = self._run(model, ckpt_path=ckpt_path, weights_only=weights_only)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1079, in _run
results = self._run_stage()
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1116, in _run_stage
return self._evaluation_loop.run()
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
return loop_run(self, *args, **kwargs)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 146, in run
self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 441, in _evaluation_step
output = call._call_strategy_hook(trainer, hook_name, *step_args)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 329, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 425, in test_step
return self.lightning_module.test_step(*args, **kwargs)
File "/scratch/gilbreth/shar1159/robotvision/src/camera-based-e2e/models/transfuser/team_code_transfuser/latentTF_p3.py", line 176, in test_step
loss = self.per_example_ade(pred, batch["FUTURE"]).mean()
File "/scratch/gilbreth/shar1159/robotvision/src/camera-based-e2e/models/transfuser/team_code_transfuser/latentTF_p3.py", line 160, in per_example_ade
return torch.norm(pred - gt, dim=-1).mean(dim=1)
RuntimeError: The size of tensor a (20) must match the size of tensor b (0) at non-singleton dimension 1
srun: error: gilbreth-h004: task 0: Exited with exit code 1
3,173 changes: 3,173 additions & 0 deletions src/camera-based-e2e/latent_p3/waymo-e2e-latent-10284875.out

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions src/camera-based-e2e/latent_p4/waymo-e2e-latent-10284885.err
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
You are using a CUDA device ('NVIDIA A10') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:881: Checkpoint directory /scratch/gilbreth/shar1159/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision bf16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead.

| Name | Type | Params | Mode | FLOPs
--------------------------------------------------------------
0 | backbone | latentTFBackbone | 66.2 M | train | 0
1 | head | Sequential | 169 K | train | 0
--------------------------------------------------------------
66.4 M Trainable params
0 Non-trainable params
66.4 M Total params
265.467 Total estimated model params size (MB)
779 Modules in train mode
0 Modules in eval mode
0 Total Flops
SLURM auto-requeueing enabled. Setting signal handlers.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:123: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 16. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 8. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
`Trainer.fit` stopped: `max_epochs=10` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.
Traceback (most recent call last):
File "/scratch/gilbreth/shar1159/robotvision/src/camera-based-e2e/train_latent.py", line 141, in <module>
trainer.test(lit_model, dataloaders=test_loader)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 821, in test
return call._call_and_handle_interrupt(
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 49, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 864, in _test_impl
results = self._run(model, ckpt_path=ckpt_path, weights_only=weights_only)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1079, in _run
results = self._run_stage()
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1116, in _run_stage
return self._evaluation_loop.run()
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
return loop_run(self, *args, **kwargs)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 139, in run
batch, batch_idx, dataloader_idx = next(data_fetcher)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/loops/fetchers.py", line 134, in __next__
batch = super().__next__()
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/loops/fetchers.py", line 61, in __next__
batch = next(self.iterator)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/combined_loader.py", line 341, in __next__
out = next(self._iterator)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/pytorch_lightning/utilities/combined_loader.py", line 142, in __next__
out = next(self.iterators[0])
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 701, in __next__
data = self._next_data()
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1445, in _next_data
return self._process_data(data)
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1491, in _process_data
data.reraise()
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/torch/_utils.py", line 715, in reraise
raise exception
FileNotFoundError: Caught FileNotFoundError in DataLoader worker process 1.
Original Traceback (most recent call last):
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
data = fetcher.fetch(index) # type: ignore[possibly-undefined]
File "/scratch/gilbreth/shar1159/conda_envs/waymo/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 33, in fetch
data.append(next(self.dataset_iter))
File "/scratch/gilbreth/shar1159/robotvision/src/camera-based-e2e/loader.py", line 77, in __iter__
self.file = open(os.path.join(self.data_dir, filename), 'rb')
FileNotFoundError: [Errno 2] No such file or directory: '/scratch/gilbreth/shar1159/waymo_open_dataset_end_to_end_camera_v_1_0_0/test_202504211836-202504220845.tfrecord-00014-of-00266'

srun: error: gilbreth-h004: task 0: Exited with exit code 1
3,173 changes: 3,173 additions & 0 deletions src/camera-based-e2e/latent_p4/waymo-e2e-latent-10284885.out

Large diffs are not rendered by default.

Loading