
Stuck when quantizing GLM4.6 in w4a8 #560

@Kevin-XiongC

Description

import copy
import json
import random

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint
from modelopt.torch.utils.dataset_utils import get_dataset_dataloader

def handle_line(line: str) -> str:
    """Extract a prompt string from one JSONL record; return "" if none is found."""
    line = line.strip()
    if not line:
        return ""
    x = json.loads(line)
    if "prompt" in x:
        return x["prompt"]
    elif "messages" in x:
        return x["messages"][0]["content"]
    return ""

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, path: str, tokenizer: PreTrainedTokenizerBase, max_length: int = 512, device: str = "cuda", max_samples: int | None = None):
        # Read prompts from a JSONL file, dropping empty lines and records without text.
        prompts = []
        with open(path, "r") as f:
            for line in f:
                text = handle_line(line)
                if text:
                    prompts.append(text)
        random.shuffle(prompts)
        if max_samples is not None:
            prompts = prompts[:max_samples]
        # Tokenize everything up front and move the tensors to the target device.
        self.encodings = tokenizer.batch_encode_plus(
            prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_length
        ).to(device)
    
    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, index):
        # Return one pre-tokenized sample (input_ids, attention_mask, ...).
        return {key: val[index] for key, val in self.encodings.items()}
# Set up the model and tokenizer
model = AutoModelForCausalLM.from_pretrained("/models/glm4.6/", trust_remote_code=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("/models/glm4.6/", trust_remote_code=True)

# Simplified example: set up a calibration data loader with the desired calibration size
calib_set = CustomDataset("/models/mlops/mlops/test/online_replay/real.json", tokenizer, max_samples=640, device=model.device, max_length=512)
calib_loader = DataLoader(dataset=calib_set, batch_size=8, shuffle=True)

# Customize the W4A8 AWQ config: leave the first three decoder layers unquantized.
CUSTOM_W4A8_AWQ_CFG = copy.deepcopy(mtq.W4A8_AWQ_BETA_CFG)
for i in range(3):
    CUSTOM_W4A8_AWQ_CFG["quant_cfg"][f"*layers.{i}.*"] = {"enable": False}


# Define the calibration forward loop that ModelOpt runs during PTQ
def forward_loop(model):
    for batch in tqdm(calib_loader):
        model(batch["input_ids"])

# PTQ with in-place replacement of quantized modules
model = mtq.quantize(model, CUSTOM_W4A8_AWQ_CFG, forward_loop)
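# Optional sanity check: print a per-module quantization summary to confirm which
# layers were quantized and which were skipped (helper assumed to be available in
# the installed ModelOpt version).
# mtq.print_quant_summary(model)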

# Export the quantized model as a Hugging Face checkpoint

with torch.inference_mode():
    export_hf_checkpoint(
        model,  # The quantized model.
        export_dir="/models/output_trt",  # The directory where the exported files will be stored.
    )
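# Optional follow-up: list the export directory to confirm the checkpoint files
# (config, tokenizer files, safetensors shards, quantization config) were written.
# import os; print(sorted(os.listdir("/models/output_trt")))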

I would like to use the script above to convert a bfloat16 GLM-4.6 model into a W4A8 quantized model using production data. However, the process has been running for over 27 hours with no observable GPU utilization, even though GPU memory is still occupied. The quantization is running on an 8×H200 server with 2 TB of host memory.

(Screenshot: GPU status showing occupied memory but no utilization)

According to the logs below, the calibration and search steps have already finished. Is this expected? Should I kill the process?
(Screenshot: log output showing calibration/search completed)
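In case it helps to localize the hang on a future run, a watchdog from the standard library could be added near the top of the script to periodically dump every thread's Python stack (a minimal sketch; the 10-minute interval is an arbitrary choice):

import faulthandler, sys
# Print all Python thread stacks to stderr every 600 s until cancelled, so the
# stage where the process is stuck shows up in the logs.
faulthandler.dump_traceback_later(timeout=600, repeat=True, file=sys.stderr)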
