Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions fasterai/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,15 +234,21 @@
'fasterai.misc.conv_decomposer._unfold': ( 'misc/conv_decomposer.html#_unfold',
'fasterai/misc/conv_decomposer.py')},
'fasterai.misc.cpu_optimizer': { 'fasterai.misc.cpu_optimizer.accelerate_model_for_cpu': ( 'misc/cpu_optimizer.html#accelerate_model_for_cpu',
'fasterai/misc/cpu_optimizer.py')},
'fasterai/misc/cpu_optimizer.py'),
'fasterai.misc.cpu_optimizer.optimize_for_cpu': ( 'misc/cpu_optimizer.html#optimize_for_cpu',
'fasterai/misc/cpu_optimizer.py')},
'fasterai.misc.fc_decomposer': { 'fasterai.misc.fc_decomposer.FC_Decomposer': ( 'misc/fc_decomposer.html#fc_decomposer',
'fasterai/misc/fc_decomposer.py'),
'fasterai.misc.fc_decomposer.FC_Decomposer.SVD': ( 'misc/fc_decomposer.html#fc_decomposer.svd',
'fasterai/misc/fc_decomposer.py'),
'fasterai.misc.fc_decomposer.FC_Decomposer.__init__': ( 'misc/fc_decomposer.html#fc_decomposer.__init__',
'fasterai/misc/fc_decomposer.py'),
'fasterai.misc.fc_decomposer.FC_Decomposer.decompose': ( 'misc/fc_decomposer.html#fc_decomposer.decompose',
'fasterai/misc/fc_decomposer.py')},
'fasterai/misc/fc_decomposer.py'),
'fasterai.misc.fc_decomposer._rank_from_energy': ( 'misc/fc_decomposer.html#_rank_from_energy',
'fasterai/misc/fc_decomposer.py'),
'fasterai.misc.fc_decomposer._should_decompose': ( 'misc/fc_decomposer.html#_should_decompose',
'fasterai/misc/fc_decomposer.py')},
'fasterai.prune.all': {},
'fasterai.prune.prune_callback': { 'fasterai.prune.prune_callback.PruneCallback': ( 'prune/prune_callback.html#prunecallback',
'fasterai/prune/prune_callback.py'),
Expand Down
3 changes: 2 additions & 1 deletion fasterai/misc/all.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .bn_folding import *
from .fc_decomposer import *
from .conv_decomposer import *
from .conv_decomposer import *
from .cpu_optimizer import *
1 change: 0 additions & 1 deletion fasterai/misc/bn_folding.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
# %% ../../nbs/misc/bn_folding.ipynb #productive-preparation
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy

# %% ../../nbs/misc/bn_folding.ipynb #83000749
Expand Down
61 changes: 40 additions & 21 deletions fasterai/misc/conv_decomposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,28 @@
import copy

# %% ../../nbs/misc/conv_decomposer.ipynb #conv-decomposer
from .fc_decomposer import _rank_from_energy, _should_decompose

def _unfold(tensor, mode):
"Unfold a tensor along a mode into a matrix"
return tensor.moveaxis(mode, 0).flatten(1)

def _partial_tucker(weight, ranks, n_iter=5):
def _partial_tucker(weight, ranks, n_iter=10, tol=1e-4):
"Partial Tucker decomposition on modes [0, 1] via alternating SVD (HOOI)"
# Initialize factors from SVD of mode unfoldings
U0 = torch.linalg.svd(_unfold(weight, 0), full_matrices=False)[0][:, :ranks[0]]
U1 = torch.linalg.svd(_unfold(weight, 1), full_matrices=False)[0][:, :ranks[1]]

for _ in range(n_iter):
# Project out mode 0 using U0, then update U1
U0_prev, U1_prev = U0.clone(), U1.clone()
# Project out mode 0, update U1
proj = torch.einsum('oihw, or -> rihw', weight, U0)
U1 = torch.linalg.svd(_unfold(proj, 1), full_matrices=False)[0][:, :ranks[1]]
# Project out mode 1 using U1, then update U0
# Project out mode 1, update U0
proj = torch.einsum('oihw, is -> oshw', weight, U1)
U0 = torch.linalg.svd(_unfold(proj, 0), full_matrices=False)[0][:, :ranks[0]]
# Early stopping on convergence
if (U0 - U0_prev).norm() + (U1 - U1_prev).norm() < tol: break

# Core = W ×₀ U0ᵀ ×₁ U1ᵀ
core = torch.einsum('oihw, or, is -> rshw', weight, U0, U1)
return core, [U0, U1]

Expand All @@ -38,35 +41,51 @@ class Conv_Decomposer:
def __init__(self): pass

def decompose(self,
model: nn.Module, # The model to decompose
percent_removed: float = 0.5, # Fraction of rank to remove per mode [0, 1)
model: nn.Module, # The model to decompose
percent_removed: float = 0.5, # Fraction of rank to remove per mode [0, 1)
energy_threshold: float | None = None, # Auto rank: keep this fraction of energy (0-1)
layers: list[str] | None = None, # Layer names to decompose (None = all eligible)
exclude: list[str] | None = None, # Layer names to skip
n_iter: int = 10, # Max HOOI iterations
tol: float = 1e-4, # HOOI convergence tolerance
) -> nn.Module:
"Recursively decompose all eligible Conv2d layers in the model"
if not (0 <= percent_removed < 1):
"Decompose eligible Conv2d layers. Use energy_threshold for automatic rank selection."
if energy_threshold is None and not (0 <= percent_removed < 1):
raise ValueError(f"percent_removed must be in range [0, 1), got {percent_removed}")
if energy_threshold is not None and not (0 < energy_threshold <= 1):
raise ValueError(f"energy_threshold must be in range (0, 1], got {energy_threshold}")

new_model = copy.deepcopy(model)
for name in list(new_model._modules):
module = new_model._modules[name]
if len(list(module._modules)) > 0:
new_model._modules[name] = self.decompose(module, percent_removed)
elif isinstance(module, nn.Conv2d) and module.groups == 1 and min(module.kernel_size) > 1:
new_model._modules[name] = self.Tucker(module, percent_removed)
for name, module in list(new_model.named_modules()):
if (isinstance(module, nn.Conv2d) and module.groups == 1
and min(module.kernel_size) > 1
and _should_decompose(name, layers, exclude)):
parent_name, _, child_name = name.rpartition('.')
parent = new_model.get_submodule(parent_name) if parent_name else new_model
setattr(parent, child_name, self.Tucker(module, percent_removed, energy_threshold, n_iter, tol))
return new_model

def Tucker(self,
layer: nn.Conv2d, # The Conv2d layer to decompose
percent_removed: float, # Fraction of rank to remove per mode
layer: nn.Conv2d, # The Conv2d layer to decompose
percent_removed: float = 0.5, # Fraction of rank to remove per mode
energy_threshold: float | None = None, # Auto rank via energy retention
n_iter: int = 10, # Max HOOI iterations
tol: float = 1e-4, # HOOI convergence tolerance
) -> nn.Sequential:
"Perform Tucker decomposition on a single Conv2d layer"
W = layer.weight.data
C_out, C_in = W.shape[:2]

R_out = max(1, int((1 - percent_removed) * C_out))
R_in = max(1, int((1 - percent_removed) * C_in))
if energy_threshold is not None:
S0 = torch.linalg.svd(_unfold(W, 0), full_matrices=False)[1]
S1 = torch.linalg.svd(_unfold(W, 1), full_matrices=False)[1]
R_out = _rank_from_energy(S0, energy_threshold)
R_in = _rank_from_energy(S1, energy_threshold)
else:
R_out = max(1, int((1 - percent_removed) * C_out))
R_in = max(1, int((1 - percent_removed) * C_in))

core, (U_out, U_in) = _partial_tucker(W, [R_out, R_in])
# core: (R_out, R_in, H, W), U_out: (C_out, R_out), U_in: (C_in, R_in)
core, (U_out, U_in) = _partial_tucker(W, [R_out, R_in], n_iter=n_iter, tol=tol)

# 1. Pointwise input compression: (C_in → R_in)
first = nn.Conv2d(C_in, R_in, 1, bias=False)
Expand Down
37 changes: 27 additions & 10 deletions fasterai/misc/cpu_optimizer.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,37 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/misc/cpu_optimizer.ipynb.

# %% auto #0
__all__ = ['accelerate_model_for_cpu']
__all__ = ['optimize_for_cpu', 'accelerate_model_for_cpu']

# %% ../../nbs/misc/cpu_optimizer.ipynb #fbbccd4a
import torch
import torch.nn as nn
from torch.utils.mobile_optimizer import optimize_for_mobile
import warnings

# %% ../../nbs/misc/cpu_optimizer.ipynb #6524ac31
def accelerate_model_for_cpu(model: nn.Module, example_input: torch.Tensor):
model.eval()
example_input = example_input.to(memory_format=torch.channels_last)

model = model.to(memory_format=torch.channels_last)
model = torch.jit.script(model)
model = optimize_for_mobile(model)
def optimize_for_cpu(
    model: nn.Module,               # The PyTorch model to optimize
    sample: torch.Tensor,           # Sample input for tracing (with batch dim)
    *,
    backend: str = "compile",       # "compile" (torch.compile) or "trace" (torch.jit.trace)
    compile_mode: str = "default",  # torch.compile mode
) -> nn.Module:
    "Optimize model for CPU inference via channels-last layout + compilation. Raises ValueError on unknown backend."
    # Validate before touching the model: `eval()` and the layout change act on the
    # caller's module, so an invalid backend must not leave it half-modified.
    if backend not in ("compile", "trace"):
        raise ValueError(f"Unknown backend: {backend!r}. Use 'compile' or 'trace'.")

    model = model.eval().to(memory_format=torch.channels_last)

    # The compile backend does not need the sample
    if backend == "compile":
        return torch.compile(model, mode=compile_mode)

    # channels_last is only defined for 4D tensors; converting any other rank raises,
    # so non-image samples (e.g. (batch, features)) are passed through unchanged.
    if sample.dim() == 4:
        sample = sample.to(memory_format=torch.channels_last)
    with torch.no_grad():
        return torch.jit.trace(model, sample)

return model
def accelerate_model_for_cpu(model: nn.Module, example_input: torch.Tensor):
    "Deprecated alias: emits a DeprecationWarning and forwards to optimize_for_cpu() with the trace backend"
    message = "accelerate_model_for_cpu is deprecated, use optimize_for_cpu(model, sample) instead"
    # stacklevel=2 attributes the warning to the caller rather than to this shim
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    optimized = optimize_for_cpu(model, example_input, backend="trace")
    return optimized
56 changes: 36 additions & 20 deletions fasterai/misc/fc_decomposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,45 +6,61 @@
# %% ../../nbs/misc/fc_decomposer.ipynb #fbbccd4a
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy

# %% ../../nbs/misc/fc_decomposer.ipynb #6524ac31
def _rank_from_energy(S, threshold):
"Find minimum rank to retain `threshold` fraction of singular value energy"
energy = S.pow(2).cumsum(0) / S.pow(2).sum()
idx = (energy >= threshold).nonzero(as_tuple=True)[0]
return max(1, int(idx[0].item()) + 1) if len(idx) > 0 else S.shape[0]

def _should_decompose(name, layers=None, exclude=None):
"Check if a named layer should be decomposed"
if exclude and name in exclude: return False
if layers is not None: return name in layers
return True

class FC_Decomposer:
"Decompose fully-connected layers using SVD to reduce parameters"

def __init__(self):
pass
def __init__(self): pass

def decompose(self,
model: nn.Module, # The model to decompose
percent_removed: float = 0.5 # Fraction of singular values to remove [0, 1)
model: nn.Module, # The model to decompose
percent_removed: float = 0.5, # Fraction of singular values to remove [0, 1)
energy_threshold: float | None = None, # Auto rank: keep this fraction of energy (0-1)
layers: list[str] | None = None, # Layer names to decompose (None = all)
exclude: list[str] | None = None, # Layer names to skip
) -> nn.Module:
"Recursively decompose all Linear layers in the model using SVD"
if not (0 <= percent_removed < 1):
"Decompose Linear layers using SVD. Use energy_threshold for automatic rank selection."
if energy_threshold is None and not (0 <= percent_removed < 1):
raise ValueError(f"percent_removed must be in range [0, 1), got {percent_removed}")
if energy_threshold is not None and not (0 < energy_threshold <= 1):
raise ValueError(f"energy_threshold must be in range (0, 1], got {energy_threshold}")

new_model = copy.deepcopy(model)
module_names = list(new_model._modules)

for k, name in enumerate(module_names):
if len(list(new_model._modules[name]._modules)) > 0:
new_model._modules[name] = self.decompose(new_model._modules[name], percent_removed)
else:
if isinstance(new_model._modules[name], nn.Linear):
layer = self.SVD(new_model._modules[name], percent_removed)
new_model._modules[name] = layer
for name, module in list(new_model.named_modules()):
if isinstance(module, nn.Linear) and _should_decompose(name, layers, exclude):
parent_name, _, child_name = name.rpartition('.')
parent = new_model.get_submodule(parent_name) if parent_name else new_model
setattr(parent, child_name, self.SVD(module, percent_removed, energy_threshold))
return new_model


def SVD(self,
layer: nn.Linear, # The Linear layer to decompose
percent_removed: float # Fraction of singular values to remove
layer: nn.Linear, # The Linear layer to decompose
percent_removed: float = 0.5, # Fraction of singular values to remove
energy_threshold: float | None = None, # Auto rank via energy retention
) -> nn.Sequential:
"Perform SVD decomposition on a single Linear layer"
W = layer.weight.data
U, S, Vh = torch.linalg.svd(W, full_matrices=False)
L = max(1, int((1.-percent_removed) * S.shape[0]))

if energy_threshold is not None:
L = _rank_from_energy(S, energy_threshold)
else:
L = max(1, int((1.-percent_removed) * S.shape[0]))

W1 = U[:,:L]
W2 = torch.diag(S[:L]) @ Vh[:L]
layer_1 = nn.Linear(in_features=layer.in_features,
Expand Down
2 changes: 2 additions & 0 deletions nbs/_quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ website:
contents:
- misc/bn_folding.ipynb
- misc/fc_decomposer.ipynb
- misc/conv_decomposer.ipynb
- misc/cpu_optimizer.ipynb
- section: Export
contents:
- export/onnx_exporter.ipynb
Expand Down
3 changes: 1 addition & 2 deletions nbs/misc/bn_folding.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
"#| export\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import copy"
]
},
Expand Down Expand Up @@ -388,4 +387,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
Loading
Loading