
Stuck when quantizing GLM4.6 in w4a8 #560

@Kevin-XiongC

Description

import copy
import json
import random

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint
from modelopt.torch.utils.dataset_utils import get_dataset_dataloader

def handle_line(line: str) -> str:
    """Extract a prompt string from one JSONL record; return "" if none is found."""
    line = line.strip()
    if not line:
        return ""
    x = json.loads(line)
    if "prompt" in x:
        return x["prompt"]
    elif "messages" in x:
        return x["messages"][0]["content"]
    return ""

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, path: str, tokenizer: PreTrainedTokenizerBase, max_length: int = 512, device: str = "cuda", max_samples: int | None = None):
        # Read prompts from a JSONL file, dropping empty lines and records without text.
        prompts = []
        with open(path, "r") as f:
            for line in f:
                text = handle_line(line)
                if text:
                    prompts.append(text)
        random.shuffle(prompts)
        if max_samples is not None:
            prompts = prompts[:max_samples]
        # Tokenize everything up front and move the tensors to the target device.
        self.encodings = tokenizer.batch_encode_plus(
            prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_length
        ).to(device)
    
    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, index):
        # Return one pre-tokenized sample (input_ids, attention_mask, ...).
        return {key: val[index] for key, val in self.encodings.items()}
# Set up the model and tokenizer
model = AutoModelForCausalLM.from_pretrained("/models/glm4.6/", trust_remote_code=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("/models/glm4.6/", trust_remote_code=True)

# Simplified example: set up a calibration data loader with the desired calibration size
calib_set = CustomDataset("/models/mlops/mlops/test/online_replay/real.json", tokenizer, max_samples=640, device=model.device, max_length=512)
calib_loader = DataLoader(dataset=calib_set, batch_size=8, shuffle=True)

# Customize the W4A8 AWQ config: leave the first three decoder layers unquantized.
CUSTOM_W4A8_AWQ_CFG = copy.deepcopy(mtq.W4A8_AWQ_BETA_CFG)
for i in range(3):
    CUSTOM_W4A8_AWQ_CFG["quant_cfg"][f"*layers.{i}.*"] = {"enable": False}


# Define the calibration forward loop that ModelOpt runs during PTQ
def forward_loop(model):
    for batch in tqdm(calib_loader):
        model(batch["input_ids"])

# PTQ with in-place replacement of quantized modules
model = mtq.quantize(model, CUSTOM_W4A8_AWQ_CFG, forward_loop)
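# Optional sanity check: print a per-module quantization summary to confirm which
# layers were quantized and which were skipped (helper assumed to be available in
# the installed ModelOpt version).
# mtq.print_quant_summary(model)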

# Export the quantized model as a Hugging Face checkpoint

with torch.inference_mode():
    export_hf_checkpoint(
        model,  # The quantized model.
        export_dir="/models/output_trt",  # The directory where the exported files will be stored.
    )
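# Optional follow-up: list the export directory to confirm the checkpoint files
# (config, tokenizer files, safetensors shards, quantization config) were written.
# import os; print(sorted(os.listdir("/models/output_trt")))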

I would like to use the script above to convert a bfloat16 GLM-4.6 model into a W4A8 quantized model using production data. However, the process has been running for over 27 hours with no observable GPU utilization, even though GPU memory is still occupied. The quantization is running on an 8×H200 server with 2 TB of host memory.

(Screenshot: GPU status showing occupied memory but no utilization)

According to the logs below, the calibration and search steps have already finished. Is this expected? Should I kill the process?
(Screenshot: log output showing calibration/search completed)
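In case it helps to localize the hang on a future run, a watchdog from the standard library could be added near the top of the script to periodically dump every thread's Python stack (a minimal sketch; the 10-minute interval is an arbitrary choice):

import faulthandler, sys
# Print all Python thread stacks to stderr every 600 s until cancelled, so the
# stage where the process is stuck shows up in the logs.
faulthandler.dump_traceback_later(timeout=600, repeat=True, file=sys.stderr)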
