Skip to content

Commit 942f1ca

Browse files
committed
Release v1.7.1 of NNCF to master
1 parent 359dd72 commit 942f1ca

File tree

18 files changed

+349
-65
lines changed

18 files changed

+349
-65
lines changed

ReleaseNotes.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ samples distributed with the code. The samples demonstrate the usage of compres
77
public models and datasets for three different use cases: Image Classification, Object Detection,
88
and Semantic Segmentation.
99

10+
## New in Release 1.7.1:
11+
Bugfixes:
12+
- Fixed a bug where compressed models that were supposed to return named tuples actually returned regular tuples
13+
- Fixed an issue with batch norm adaptation-enabled compression runs hanging in the DDP scenario
14+
1015
## New in Release 1.7:
1116
- Adjust Padding feature to support accurate execution of U4 on VPU - when setting "target_device" to "VPU", the training-time padding values for quantized convolutions will be adjusted to better reflect VPU inference process.
1217
- Weighted layers that are "frozen" (i.e. have requires_grad set to False at compressed model creation time) are no longer considered for compression, to better handle transfer learning cases.

examples/classification/main.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ def autoq_eval_fn(model, eval_loader):
199199
if config.mode.lower() == 'train':
200200
train(config, compression_ctrl, model, criterion, train_criterion_fn, lr_scheduler, model_name, optimizer,
201201
train_loader, train_sampler, val_loader, best_acc1)
202+
config.mlflow.end_run()
202203

203204

204205
def train(config, compression_ctrl, model, criterion, criterion_fn, lr_scheduler, model_name, optimizer,
@@ -267,9 +268,11 @@ def get_dataset(dataset_config, config, transform, is_train):
267268
if dataset_config == 'imagenet':
268269
prefix = 'train' if is_train else 'val'
269270
return datasets.ImageFolder(osp.join(config.dataset_dir, prefix), transform)
271+
# For testing purposes
270272
if dataset_config == 'mock_32x32':
271-
# For testing purposes
272273
return MockDataset(img_size=(32, 32), transform=transform)
274+
if dataset_config == 'mock_299x299':
275+
return MockDataset(img_size=(299, 299), transform=transform)
273276
return create_cifar(config, dataset_config, is_train, transform)
274277

275278

@@ -287,15 +290,16 @@ def create_cifar(config, dataset_config, is_train, transform):
287290
def create_datasets(config):
288291
dataset_config = config.dataset if config.dataset is not None else 'imagenet'
289292
dataset_config = dataset_config.lower()
290-
assert dataset_config in ['imagenet', 'cifar100', 'cifar10', 'mock_32x32'], "Unknown dataset option"
293+
assert dataset_config in ['imagenet', 'cifar100', 'cifar10', 'mock_32x32', 'mock_299x299'], \
294+
"Unknown dataset option"
291295

292296
if dataset_config == 'imagenet':
293297
normalize = transforms.Normalize(mean=(0.485, 0.456, 0.406),
294298
std=(0.229, 0.224, 0.225))
295299
elif dataset_config == 'cifar100':
296300
normalize = transforms.Normalize(mean=(0.5071, 0.4865, 0.4409),
297301
std=(0.2673, 0.2564, 0.2761))
298-
elif dataset_config in ['cifar10', 'mock_32x32']:
302+
elif dataset_config in ['cifar10', 'mock_32x32', 'mock_299x299']:
299303
normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5),
300304
std=(0.5, 0.5, 0.5))
301305

examples/common/utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@ def safe_call(self, func: str, *args, **kwargs) -> Maybe:
128128
return Maybe.from_value(self._get_mlflow()).bind(
129129
lambda obj: Maybe.from_value(getattr(obj, func)(*args, **kwargs)))
130130

131+
def end_run(self):
132+
self.safe_call('end_run')
133+
131134
def _is_enabled(self):
132135
return self.is_suitable_mode and is_main_process()
133136

nncf/dynamic_graph/context.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -475,11 +475,11 @@ def _init_thread_local(self):
475475
tl.operator_counters = {}
476476
tl.node_call_tracker = {}
477477

478-
def register_node_call(self, node_key: str):
479-
if node_key in self._thread_local.node_call_tracker:
480-
self._thread_local.node_call_tracker[node_key] += 1
478+
def register_node_call(self, node: NNCFNode):
479+
if node.node_id in self._thread_local.node_call_tracker:
480+
self._thread_local.node_call_tracker[node.node_id] += 1
481481
else:
482-
self._thread_local.node_call_tracker[node_key] = 1
482+
self._thread_local.node_call_tracker[node.node_id] = 1
483483

484484
def reset_node_call_counters(self):
485485
for k, _ in self._thread_local.node_call_tracker.items():

nncf/dynamic_graph/wrappers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def wrapped(*args, **kwargs):
7979
node = ctx.maybe_add_node(processed_input, tensor_metas, ia_op_exec_context, module_attrs)
8080

8181
if is_debug():
82-
ctx.register_node_call(ctx.graph.get_node_key_by_id(node.node_id))
82+
ctx.register_node_call(node)
8383

8484
args = tuple(processed_input.op_args)
8585
kwargs = processed_input.op_kwargs

nncf/initialization.py

Lines changed: 43 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from nncf.structures import QuantizationRangeInitArgs
1818
from nncf.utils import is_tensor
1919
from nncf.utils import objwalk
20-
from nncf.utils import training_mode_switcher
20+
from contextlib import contextmanager
2121

2222

2323
class InitializingDataLoader:
@@ -164,39 +164,66 @@ def __init__(self, model, init_device: str, num_bn_forget_steps):
164164
self.num_bn_forget_steps = num_bn_forget_steps
165165
self.momentum_bn_forget = 0.9
166166
self.original_momenta_values = {}
167+
self.original_training_state = {}
167168

168169
@staticmethod
169170
def _apply_to_batchnorms(func):
170171
def func_apply_to_bns(module):
171-
if isinstance(module, torch.nn.modules.batchnorm.BatchNorm2d):
172+
if isinstance(module, (torch.nn.modules.batchnorm.BatchNorm1d,
173+
torch.nn.modules.batchnorm.BatchNorm2d,
174+
torch.nn.modules.batchnorm.BatchNorm3d)):
172175
func(module)
173176

174177
return func_apply_to_bns
175178

176-
def _run_model_inference(self, data_loader, num_init_steps, device):
177-
num_bn_forget_steps = self.num_bn_forget_steps
179+
@contextmanager
180+
def _bn_training_state_switcher(self) -> None:
181+
def save_original_bn_training_state(module: torch.nn.Module):
182+
self.original_training_state[module] = module.training
183+
184+
def set_bn_training_state(module: torch.nn.Module, state: Dict[str, bool]):
185+
module.training = state
186+
187+
def restore_original_bn_training_state(module: torch.nn.Module):
188+
module.training = self.original_training_state[module]
189+
190+
self.model.apply(self._apply_to_batchnorms(save_original_bn_training_state))
191+
self.model.apply(self._apply_to_batchnorms(partial(set_bn_training_state, state=True)))
192+
try:
193+
yield
194+
finally:
195+
self.model.apply(self._apply_to_batchnorms(restore_original_bn_training_state))
178196

197+
@contextmanager
198+
def _bn_momentum_switcher(self) -> None:
179199
def set_bn_momentum(module, momentum_value):
180200
module.momentum = momentum_value
181201

182-
def save_original_bn_momenta(module):
202+
def save_original_bn_momentum(module: torch.nn.Module):
183203
self.original_momenta_values[module] = module.momentum
184204

185-
def restore_original_bn_momenta(module):
205+
def restore_original_bn_momentum(module: torch.nn.Module):
186206
module.momentum = self.original_momenta_values[module]
187207

188-
with training_mode_switcher(self.model, is_training=True):
189-
self.model.apply(self._apply_to_batchnorms(save_original_bn_momenta))
190-
self.model.apply(self._apply_to_batchnorms(partial(set_bn_momentum,
191-
momentum_value=self.momentum_bn_forget)))
208+
self.model.apply(self._apply_to_batchnorms(save_original_bn_momentum))
209+
self.model.apply(self._apply_to_batchnorms(partial(set_bn_momentum,
210+
momentum_value=self.momentum_bn_forget)))
211+
try:
212+
yield
213+
finally:
214+
self.model.apply(self._apply_to_batchnorms(restore_original_bn_momentum))
192215

193-
for i, loaded_item in enumerate(data_loader):
194-
if num_bn_forget_steps is not None and i >= num_bn_forget_steps:
195-
break
196-
args_kwargs_tuple = data_loader.get_inputs(loaded_item)
197-
self._infer_batch(args_kwargs_tuple, device)
216+
def _run_model_inference(self, data_loader, num_init_steps, device):
217+
num_bn_forget_steps = self.num_bn_forget_steps
198218

199-
self.model.apply(self._apply_to_batchnorms(restore_original_bn_momenta))
219+
with self._bn_training_state_switcher():
220+
if num_bn_forget_steps is not None and num_bn_forget_steps > 0:
221+
with self._bn_momentum_switcher():
222+
for i, loaded_item in enumerate(data_loader):
223+
if i >= num_bn_forget_steps:
224+
break
225+
args_kwargs_tuple = data_loader.get_inputs(loaded_item)
226+
self._infer_batch(args_kwargs_tuple, device)
200227

201228
for i, loaded_item in ProgressBar(
202229
enumerate(data_loader),

nncf/model_creation.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from typing import Callable, Any, Tuple, Dict
1515

1616
from torch.nn import Module
17+
from torch.distributed import barrier
1718

1819
from nncf.checkpoint_loading import load_state
1920
from nncf.composite_compression import PTCompositeCompressionAlgorithmBuilder
@@ -24,6 +25,7 @@
2425
from nncf.graph.graph_builder import GraphBuilder
2526
from nncf.nncf_network import NNCFNetwork
2627
from nncf.utils import is_main_process
28+
from nncf.utils import is_dist_avail_and_initialized
2729
from nncf.algo_selector import COMPRESSION_ALGORITHMS
2830

2931
from nncf.common.utils.logger import logger
@@ -141,4 +143,19 @@ def create_compressed_model(model: Module, config: NNCFConfig,
141143
graph = compressed_graph_builder.build_graph(compressed_model, compressed_model.get_tracing_context())
142144
graph.visualize_graph(osp.join(config.get("log_dir", "."), "compressed_graph.dot"))
143145

146+
# Synchronize all processes if run in distributed mode
147+
if is_dist_avail_and_initialized():
148+
try:
149+
barrier()
150+
# A RuntimeError can be raised while running barrier()
151+
# if the backend is not in the supported list: https://pytorch.org/docs/stable/distributed.html
152+
except RuntimeError as err:
153+
logger.warning(err)
154+
logger.warning(
155+
"NNCF continues work, while does not guarantee that "
156+
"the processes will finish model's compression at the same time. "
157+
"If your training pipeline demands the processes be synchronized, please, "
158+
"keep attention to that error")
159+
return compression_ctrl, compressed_model
160+
144161
return compression_ctrl, compressed_model

nncf/nncf_network.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -504,10 +504,15 @@ def _set_nncf_wrapped_model(self, value):
504504
def get_clean_shallow_copy(self) -> 'NNCFNetwork':
505505
# WARNING: Will reset pre- and post-ops of the underlying model. Use save_nncf_module_additions
506506
# and load_nncf_module_additions to preserve these, or temporary_clean_view().
507-
return NNCFNetwork(self.get_nncf_wrapped_model(), self.input_infos,
508-
self._user_dummy_forward_fn, self._wrap_inputs_fn,
509-
self.scopes_without_shape_matching, self.ignored_scopes, self.target_scopes,
510-
reset=True)
507+
from nncf.utils import save_module_training_state, load_module_training_state
508+
saved_state = {}
509+
save_module_training_state(self, saved_state)
510+
model_copy = NNCFNetwork(self.get_nncf_wrapped_model(), self.input_infos,
511+
self._user_dummy_forward_fn, self._wrap_inputs_fn,
512+
self.scopes_without_shape_matching, self.ignored_scopes, self.target_scopes,
513+
reset=True)
514+
load_module_training_state(model_copy, saved_state)
515+
return model_copy
511516

512517
def get_modules_in_nncf_modules_by_type(self, types) -> Dict['Scope', nn.Module]:
513518
nncf_modules = self.get_nncf_modules()

nncf/quantization/algo.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -606,7 +606,7 @@ def _get_transformation_layout(self, target_model: NNCFNetwork) -> PTTransformat
606606
target_model.register_compression_module_type(ExtraCompressionModuleType.EXTERNAL_QUANTIZER)
607607
single_config_quantizer_setup = self._get_quantizer_setup(target_model)
608608
minmax_values_for_range_init = {}
609-
if self.should_init:
609+
if is_main_process() and self.should_init:
610610
stats_for_range_init = self._get_statistics_for_final_range_init(target_model,
611611
single_config_quantizer_setup,
612612
self._range_init_params)
@@ -1365,6 +1365,9 @@ def __init__(self):
13651365
self.dump_dir = Path(DEBUG_LOG_DIR) / Path("debug_dumps")
13661366
self.dump_dir.mkdir(parents=True, exist_ok=True)
13671367
self.scale_dump_dir = self.dump_dir / Path("scale")
1368+
if self.scale_dump_dir.exists():
1369+
shutil.rmtree(str(self.scale_dump_dir))
1370+
self.scale_dump_dir.mkdir(parents=True, exist_ok=True)
13681371
self.prop_graph_dump_dir = self.dump_dir / Path("quant_prop")
13691372
if self.prop_graph_dump_dir.exists():
13701373
shutil.rmtree(str(self.prop_graph_dump_dir))
@@ -1383,9 +1386,6 @@ def init_actual(self, owner_model: NNCFNetwork):
13831386
nncf_module_quantizations_id_list)
13841387
self.call_trackers[self.ACTIVATION_QUANTIZERS_TRACKER_NAME].init_with_key_list(
13851388
activation_quantizer_id_list)
1386-
if self.scale_dump_dir.exists():
1387-
shutil.rmtree(str(self.scale_dump_dir))
1388-
self.scale_dump_dir.mkdir(parents=True, exist_ok=True)
13891389
self._strict_forward = True
13901390

13911391
def pre_forward_actions(self, module: 'NNCFNetwork'):
@@ -1428,7 +1428,7 @@ def dump_scale(self, quantizer_scale_params: Dict[str, torch.Tensor], quantizer_
14281428
quantizer_normalized_name = re.sub(r'[^\w\-_\. ]', '_', quantizer_name)
14291429
for scale_param_name, scale_param in quantizer_scale_params.items():
14301430
fname = "{}_{}.txt".format(quantizer_normalized_name, scale_param_name)
1431-
with safe_open(self.scale_dump_dir / fname, "ba") as file:
1431+
with safe_open(self.scale_dump_dir / fname, "ab") as file:
14321432
np.savetxt(file, scale_param.cpu().numpy().flatten())
14331433

14341434
def reset_counters(self):

0 commit comments

Comments
 (0)