From 1db62f41a2e3b89faf26a7a53d5296ace6af7db3 Mon Sep 17 00:00:00 2001 From: Muti Chung Date: Wed, 19 Nov 2025 09:11:21 +0000 Subject: [PATCH] Fix formatting and typos in modifiers. Signed-off-by: Muti Chung --- src/llmcompressor/modifiers/autoround/base.py | 58 ++++++++------- src/llmcompressor/modifiers/awq/base.py | 40 +++++------ .../modifiers/pruning/sparsegpt/base.py | 38 +++++----- .../modifiers/pruning/wanda/base.py | 36 +++++----- .../modifiers/quantization/gptq/base.py | 72 ++++++++++--------- .../quantization/gptq/gptq_quantize.py | 2 +- .../quantization/quantization/base.py | 2 +- .../quantization/quantization/mixin.py | 38 +++++----- .../modifiers/transform/quip/base.py | 23 +++--- .../modifiers/transform/spinquant/base.py | 27 +++---- .../modifiers/transform/spinquant/mappings.py | 2 +- 11 files changed, 179 insertions(+), 159 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 2480751a9b..881e593eb8 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -62,35 +62,39 @@ class AutoRoundModifier(Modifier, QuantizationMixin): This modifier leverages signed gradient descent (SignSGD) optimizer and block-wise loss to optimize rounding values and weight clipping in a few steps. - | Sample yaml: - | test_stage: - | modifiers: - | AutoRoundModifier: - | iters: 200 - | config_groups: - | group_0: - | targets: - | - "Linear" - | input_activations: null - | output_activations: null - | weights: - | num_bits: 4 - | type: "int" - | symmetric: true - | strategy: group - | group_size: 128 + Sample yaml: + + ```yaml + test_stage: + modifiers: + AutoRoundModifier: + iters: 200 + config_groups: + group_0: + targets: + - "Linear" + input_activations: null + output_activations: null + weights: + num_bits: 4 + type: "int" + symmetric: true + strategy: group + group_size: 128 + ``` Lifecycle: - - on_initialize - - apply config to model - - on_start - - add input capture hooks to decoding layers - - on_sequential_epoch_end - - apply_autoround - - post_autoround_cleanup - - on_finalize - - remove_hooks() - - model.apply(freeze_module_quantization) + + - on_initialize + - apply config to model + - on_start + - add input capture hooks to decoding layers + - on_sequential_epoch_end + - apply_autoround + - post_autoround_cleanup + - on_finalize + - remove_hooks() + - model.apply(freeze_module_quantization) :param config_groups: dictionary specifying quantization schemes to apply to target modules. Modules not matching a scheme target will NOT be quantized. 
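For context on the AutoRoundModifier docstring updated above, a minimal sketch of how a recipe like that sample YAML might be applied. This is an illustration only: the `oneshot` import path and keyword arguments are assumptions based on typical llmcompressor usage and may differ by version, and the model id and dataset name are placeholders.

```python
# Illustrative sketch only: applying an AutoRoundModifier recipe like the one in
# the docstring above. The `oneshot` import path and keyword arguments are
# assumptions (they vary across llmcompressor versions); the model id and the
# calibration dataset name are placeholders.
from llmcompressor import oneshot

recipe = """
test_stage:
  modifiers:
    AutoRoundModifier:
      iters: 200
      config_groups:
        group_0:
          targets: ["Linear"]
          input_activations: null
          output_activations: null
          weights:
            num_bits: 4
            type: "int"
            symmetric: true
            strategy: group
            group_size: 128
"""

oneshot(
    model="meta-llama/Llama-3.2-1B-Instruct",  # placeholder model id
    dataset="open_platypus",                   # placeholder calibration dataset
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)
```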
diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py index 98e53b4e00..dc35a5c02f 100644 --- a/src/llmcompressor/modifiers/awq/base.py +++ b/src/llmcompressor/modifiers/awq/base.py @@ -58,7 +58,6 @@ class AWQModifier(Modifier, QuantizationMixin): balance_layers: ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"] - smooth_layer: "re:.*final_layer_norm" balance_layers: ["re:.*fc1"] - ] ignore: ["lm_head"] config_groups: group_0: @@ -75,25 +74,26 @@ class AWQModifier(Modifier, QuantizationMixin): ``` Lifecycle: - - on_initialize - - resolve mappings - - capture kwargs needed for forward passes into modules - - on_start - - set up activation cache hooks to capture input activations - to balance layers - - on sequential epoch end - - apply smoothing to each smoothing layer - - consume cached activations across all batches - - clear cached activations as they are used - - find best smoothing scale for each smoothing layer - - apply to model weights - - raise error if any unused activations remain - - on_end - - re-run logic of sequential epoch end (in case of basic pipeline) - - set scales and zero points - - remove activation hooks - - on_finalize - - clear resolved mappings and captured activations + + - on_initialize + - resolve mappings + - capture kwargs needed for forward passes into modules + - on_start + - set up activation cache hooks to capture input activations + to balance layers + - on sequential epoch end + - apply smoothing to each smoothing layer + - consume cached activations across all batches + - clear cached activations as they are used + - find best smoothing scale for each smoothing layer + - apply to model weights + - raise error if any unused activations remain + - on_end + - re-run logic of sequential epoch end (in case of basic pipeline) + - set scales and zero points + - remove activation hooks + - on_finalize + - clear resolved mappings and captured activations :param sequential_targets: list of module names to compress in the same calibration pass diff --git a/src/llmcompressor/modifiers/pruning/sparsegpt/base.py b/src/llmcompressor/modifiers/pruning/sparsegpt/base.py index 0845586602..b3739f0d5f 100644 --- a/src/llmcompressor/modifiers/pruning/sparsegpt/base.py +++ b/src/llmcompressor/modifiers/pruning/sparsegpt/base.py @@ -26,24 +26,28 @@ class SparseGPTModifier(SparsityModifierBase): """ Modifier for applying the one-shot SparseGPT algorithm to a model - | Sample yaml: - | test_stage: - | obcq_modifiers: - | SparseGPTModifier: - | sparsity: 0.5 - | mask_structure: "2:4" - | dampening_frac: 0.001 - | block_size: 128 - | targets: ['Linear'] - | ignore: ['re:.*lm_head'] + Sample yaml: + + ```yaml + test_stage: + obcq_modifiers: + SparseGPTModifier: + sparsity: 0.5 + mask_structure: "2:4" + dampening_frac: 0.001 + block_size: 128 + targets: ['Linear'] + ignore: ['re:.*lm_head'] + ``` Lifecycle: - - on_initialize - - register_hook(module, calibrate_module, "forward") - - on_sequential_batch_end - - sparsify_weight - - on_finalize - - remove_hooks() + + - on_initialize + - register_hook(module, calibrate_module, "forward") + - on_sequential_batch_end + - sparsify_weight + - on_finalize + - remove_hooks() :param sparsity: Sparsity to compress model to :param sparsity_profile: Can be set to 'owl' to use Outlier Weighed @@ -92,7 +96,7 @@ def calibrate_module( :param module: module being calibrated :param args: inputs to the module, the first element of which is the - cannonical input + canonical input :param _output: uncompressed module 
output, unused """ # Assume that the first argument is the input diff --git a/src/llmcompressor/modifiers/pruning/wanda/base.py b/src/llmcompressor/modifiers/pruning/wanda/base.py index 67eb616889..2599ccdd94 100644 --- a/src/llmcompressor/modifiers/pruning/wanda/base.py +++ b/src/llmcompressor/modifiers/pruning/wanda/base.py @@ -26,23 +26,27 @@ class WandaPruningModifier(SparsityModifierBase): Modifier for applying the one-shot WANDA algorithm to a model from the paper: https://arxiv.org/abs/2306.11695 - | Sample yaml: - | test_stage: - | sparsity_modifiers: - | WandaPruningModifier: - | sparsity: 0.5 - | mask_structure: "2:4" + Sample yaml: + + ```yaml + test_stage: + sparsity_modifiers: + WandaPruningModifier: + sparsity: 0.5 + mask_structure: "2:4" + ``` Lifecycle: - - on_initialize - - register_hook(module, calibrate_module, "forward") - - run_sequential / run_basic - - make_empty_row_scalars - - accumulate_row_scalars - - on_sequential_batch_end - - sparsify_weight - - on_finalize - - remove_hooks() + + - on_initialize + - register_hook(module, calibrate_module, "forward") + - run_sequential / run_basic + - make_empty_row_scalars + - accumulate_row_scalars + - on_sequential_batch_end + - sparsify_weight + - on_finalize + - remove_hooks() :param sparsity: Sparsity to compress model to :param sparsity_profile: Can be set to 'owl' to use Outlier Weighed @@ -78,7 +82,7 @@ def calibrate_module( :param module: module being calibrated :param args: inputs to the module, the first element of which is the - cannonical input + canonical input :param _output: uncompressed module output, unused """ # Assume that the first argument is the input diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 385de9840a..ab23e4fad3 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -36,40 +36,44 @@ class GPTQModifier(Modifier, QuantizationMixin): """ Implements the GPTQ algorithm from https://arxiv.org/abs/2210.17323. This modifier uses activations to calibrate a hessian matrix, which is then used to determine - optimal quantizion values and orderings for the model weights. - - | Sample yaml: - | test_stage: - | obcq_modifiers: - | GPTQModifier: - | block_size: 128 - | dampening_frac: 0.001 - | offload_hessians: False - | actorder: static - | config_groups: - | group_0: - | targets: - | - "Linear" - | input_activations: null - | output_activations: null - | weights: - | num_bits: 8 - | type: "int" - | symmetric: true - | strategy: group - | group_size: 128 + optimal quantization values and orderings for the model weights. 
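As a side note on the Hessian calibration and activation ordering mentioned in the GPTQ docstring above, here is a minimal, library-agnostic sketch of the statistic being accumulated; it is not the repository's implementation, and the helper name is hypothetical.

```python
# Library-agnostic sketch of the Hessian statistic GPTQ-style methods accumulate
# from calibration activations; NOT the repository's implementation.
import torch

def accumulate_hessian(H, x, n_seen):
    """Running estimate of H = 2 * X^T X over calibration batches.

    x: activations feeding a Linear layer, reshaped to (tokens, in_features).
    """
    x = x.reshape(-1, x.shape[-1]).to(torch.float32)
    n_new = x.shape[0]
    # Rescale the previous estimate so old and new samples are weighted equally.
    H = H * (n_seen / (n_seen + n_new))
    H = H + (2.0 / (n_seen + n_new)) * (x.T @ x)
    return H, n_seen + n_new

in_features = 16
H = torch.zeros(in_features, in_features)
n_seen = 0
for _ in range(4):  # pretend calibration batches
    batch = torch.randn(8, 32, in_features)  # (batch, seq_len, in_features)
    H, n_seen = accumulate_hessian(H, batch, n_seen)

# "actorder"-style ordering: visit the columns with the largest diagonal
# Hessian entries (the most activation energy) first.
order = torch.argsort(torch.diag(H), descending=True)
print(order[:5])
```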
+ + Sample yaml: + + ```yaml + test_stage: + obcq_modifiers: + GPTQModifier: + block_size: 128 + dampening_frac: 0.001 + offload_hessians: False + actorder: static + config_groups: + group_0: + targets: + - "Linear" + input_activations: null + output_activations: null + weights: + num_bits: 8 + type: "int" + symmetric: true + strategy: group + group_size: 128 + ``` Lifecycle: - - on_initialize - - apply config to model - - on_start - - add activation calibration hooks - - add gptq weight calibration hooks - - on_sequential_epoch_end - - quantize_weight - - on_finalize - - remove_hooks() - - model.apply(freeze_module_quantization) + + - on_initialize + - apply config to model + - on_start + - add activation calibration hooks + - add gptq weight calibration hooks + - on_sequential_epoch_end + - quantize_weight + - on_finalize + - remove_hooks() + - model.apply(freeze_module_quantization) :param sequential_targets: list of layer names to compress during GPTQ, or '__ALL__' to compress every layer in the model @@ -99,7 +103,7 @@ class GPTQModifier(Modifier, QuantizationMixin): the kv_cache_scheme gets converted into a QuantizationScheme that: - targets the `q_proj` and `k_proj` modules of the model. The outputs of those modules are the keys and values that might be cached - - quantizes the outputs of the aformentioned layers, so that + - quantizes the outputs of the aforementioned layers, so that keys and values are compressed before storing them in the cache There is an explicit assumption that the model contains modules with `k_proj` and `v_proj` in their names. If this is not the case @@ -220,7 +224,7 @@ def calibrate_module( :param module: module being calibrated :param args: inputs to the module, the first element of which is the - cannonical input + canonical input :param _output: uncompressed module output, unused """ # Assume that first argument is the input diff --git a/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py index b621fdb801..af145df2dc 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py +++ b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py @@ -286,7 +286,7 @@ def _apply_activation_ordering( W: torch.Tensor, H: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Permute weight and hessian in order of greatest outupt activations + Permute weight and hessian in order of greatest output activations :param W: weight to permute :param H: hessian used to determine activation ordering diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py index 1330d16adf..c81a8ba75e 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/base.py +++ b/src/llmcompressor/modifiers/quantization/quantization/base.py @@ -37,7 +37,7 @@ class QuantizationModifier(Modifier, QuantizationMixin): the kv_cache_scheme gets converted into a QuantizationScheme that: - targets the `q_proj` and `k_proj` modules of the model. The outputs of those modules are the keys and values that might be cached - - quantizes the outputs of the aformentioned layers, so that + - quantizes the outputs of the aforementioned layers, so that keys and values are compressed before storing them in the cache There is an explicit assumption that the model contains modules with `k_proj` and `v_proj` in their names. 
If this is not the case diff --git a/src/llmcompressor/modifiers/quantization/quantization/mixin.py b/src/llmcompressor/modifiers/quantization/quantization/mixin.py index 42264af22e..caf4ae496c 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/mixin.py +++ b/src/llmcompressor/modifiers/quantization/quantization/mixin.py @@ -43,26 +43,28 @@ class QuantizationMixin(HooksMixin): """ - Mixin which enables a Modifier to act as a quantization config, attching observers, + Mixin which enables a Modifier to act as a quantization config, attaching observers, calibration hooks, and compression wrappers to modifiers Lifecycle: - - on_initialize: QuantizationMixin.initialize_quantization - - Attach schemes to modules - - Attach observers to modules - - Disable quantization until calibration starts/finishes - - on_start: QuantizationMixin.start_calibration - - Attach calibration hooks - - Apply calibration status - - Enable quantization during calibration - - on_end: QuantizationMixin.end_calibration - - Remove calibration hooks - - Apply freeze status - - Keep quantization enabled for future steps - NOTE: QuantizationMixin does not update scales and zero-points on its own, - as this is not desired for all Modifiers inheriting from it. Modifier must - explicitly call `update_weight_zp_scale`. - See QuantizationModifier.on_start method for example + + - on_initialize: QuantizationMixin.initialize_quantization + - Attach schemes to modules + - Attach observers to modules + - Disable quantization until calibration starts/finishes + - on_start: QuantizationMixin.start_calibration + - Attach calibration hooks + - Apply calibration status + - Enable quantization during calibration + - on_end: QuantizationMixin.end_calibration + - Remove calibration hooks + - Apply freeze status + - Keep quantization enabled for future steps + + NOTE: QuantizationMixin does not update scales and zero-points on its own, + as this is not desired for all Modifiers inheriting from it. Modifier must + explicitly call `update_weight_zp_scale`. + See QuantizationModifier.on_start method for example :param config_groups: dictionary specifying quantization schemes to apply to target modules. Modules not matching a scheme target will NOT be quantized. @@ -85,7 +87,7 @@ class QuantizationMixin(HooksMixin): the kv_cache_scheme gets converted into a QuantizationScheme that: - targets the `q_proj` and `k_proj` modules of the model. The outputs of those modules are the keys and values that might be cached - - quantizes the outputs of the aformentioned layers, so that + - quantizes the outputs of the aforementioned layers, so that keys and values are compressed before storing them in the cache There is an explicit assumption that the model contains modules with `k_proj` and `v_proj` in their names. If this is not the case diff --git a/src/llmcompressor/modifiers/transform/quip/base.py b/src/llmcompressor/modifiers/transform/quip/base.py index ace8d64fd4..12b2259a3f 100644 --- a/src/llmcompressor/modifiers/transform/quip/base.py +++ b/src/llmcompressor/modifiers/transform/quip/base.py @@ -34,15 +34,16 @@ class QuIPModifier(Modifier): the model weights and two of which remain as online rotations computed at runtime. 
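To illustrate the rotation idea referenced in the QuIP and SpinQuant docstrings, a toy sketch (not the modifiers' implementation) showing that an orthogonal rotation fused into a Linear weight, paired with the matching online rotation of its input, preserves the layer output while typically spreading activation outliers across channels:

```python
# Toy sketch of the rotation idea behind QuIP/SpinQuant-style transforms.
# Illustration only, not the modifiers' actual implementation.
import torch

torch.manual_seed(0)
in_features, out_features = 8, 4
W = torch.randn(out_features, in_features)
x = torch.randn(2, in_features)
x[:, 3] *= 50.0  # simulate an outlier channel that hurts quantization

# Random orthogonal rotation (a Hadamard matrix would be used in practice).
Q, _ = torch.linalg.qr(torch.randn(in_features, in_features))

W_rot = W @ Q  # "fused" into the checkpoint weights offline
x_rot = x @ Q  # "online" rotation computed at runtime

y_ref = x @ W.T
y_rot = x_rot @ W_rot.T
print(torch.allclose(y_ref, y_rot, atol=1e-4))          # True: output preserved
print(x.abs().max().item(), x_rot.abs().max().item())   # outlier typically shrinks
```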
Lifecycle: - - on_initialize - - as needed, create transform schemes for V (input) and U (output) - - on_start - - apply TransformConfig - - fuse transforms into weights for mergeable transforms - - add hooks for online transforms - - on sequential epoch end - - on_end - - on_finalize + + - on_initialize + - as needed, create transform schemes for V (input) and U (output) + - on_start + - apply TransformConfig + - fuse transforms into weights for mergeable transforms + - add hooks for online transforms + - on sequential epoch end + - on_end + - on_finalize :param rotations: which rotation schemes to apply to the model. Including `"v"` will rotate the input side of weights, and including `"u"` will rotate the output @@ -152,7 +153,7 @@ def _create_v_scheme(self) -> TransformScheme: apply=[ TransformArgs( targets=self.targets, - location="input", # non-mergable + location="input", # non-mergeable ignore=self.ignore, ), TransformArgs( @@ -179,7 +180,7 @@ def _create_u_scheme(self) -> TransformScheme: ), TransformArgs( targets=self.targets, - location="output", # non-mergable + location="output", # non-mergeable inverse=True, ignore=self.ignore, ), diff --git a/src/llmcompressor/modifiers/transform/spinquant/base.py b/src/llmcompressor/modifiers/transform/spinquant/base.py index 8d84e860f2..e18359be4a 100644 --- a/src/llmcompressor/modifiers/transform/spinquant/base.py +++ b/src/llmcompressor/modifiers/transform/spinquant/base.py @@ -37,7 +37,7 @@ class SpinQuantModifier(Modifier, use_enum_values=True): with learned rotations" (https://arxiv.org/abs/2405.16406) Transforms (rotations) are extra layers added to a model which reduce the accuracy - loss induced by quantization. This is achived through "rotating" weights and + loss induced by quantization. This is achieved through "rotating" weights and activations into a space with a smaller dynamic range of values, thus decreasing the range of scales required for quantization. @@ -47,18 +47,19 @@ class SpinQuantModifier(Modifier, use_enum_values=True): rotations, meaning that they require additional computation at runtime. Lifecycle: - - on_initialize - - infer SpinQuantMappings & NormMappings - - as needed, create transform schemes for R1, R2, R3, & R4 - - on_start - - normalize embeddings - - fuse norm layers into subsequent Linear layers - - apply TransformConfig - - fuse transforms into weights for mergeable transforms - - add hooks for online transforms - - on sequential epoch end - - on_end - - on_finalize + + - on_initialize + - infer SpinQuantMappings & NormMappings + - as needed, create transform schemes for R1, R2, R3, & R4 + - on_start + - normalize embeddings + - fuse norm layers into subsequent Linear layers + - apply TransformConfig + - fuse transforms into weights for mergeable transforms + - add hooks for online transforms + - on sequential epoch end + - on_end + - on_finalize :param rotations: A list containing the names of rotations to apply to the model. 
Possible rotations include R1, R2, R3, and R4 diff --git a/src/llmcompressor/modifiers/transform/spinquant/mappings.py b/src/llmcompressor/modifiers/transform/spinquant/mappings.py index 85c2f0c0f3..da3d76f6c1 100644 --- a/src/llmcompressor/modifiers/transform/spinquant/mappings.py +++ b/src/llmcompressor/modifiers/transform/spinquant/mappings.py @@ -25,7 +25,7 @@ class SpinQuantMapping(BaseModel): :param mlp_in: list of names or regexes for the mlp blocks that receive the input to the MLP block, usually up_proj and gate_proj :param mlp_out: list of names or regexes for the mlp blocks that - consitute the output of the MLP block, usually down_proj + constitute the output of the MLP block, usually down_proj """ embedding: str
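Finally, a sketch of expressing an equivalent recipe as Python modifier objects rather than YAML, following the common llmcompressor pattern of passing a list of modifiers to `oneshot`. The import paths, the `scheme` preset shortcut, and the `oneshot` keyword arguments are assumptions that may differ by version; the model id and dataset name are placeholders.

```python
# Sketch of a Python-object recipe equivalent to the YAML samples in the
# docstrings above. Import paths, the "W4A16" scheme shortcut, and the oneshot
# keyword arguments are assumptions; model and dataset names are placeholders.
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = [
    GPTQModifier(
        targets="Linear",        # same targets as the YAML config_groups
        scheme="W4A16",          # preset 4-bit weight / 16-bit activation scheme
        ignore=["lm_head"],
        dampening_frac=0.001,
        block_size=128,
    ),
]

oneshot(
    model="meta-llama/Llama-3.2-1B-Instruct",  # placeholder model id
    dataset="open_platypus",                   # placeholder calibration dataset
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)
```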