@@ -28,6 +28,7 @@ def __init__(
         device=torch.device("cpu"),
         device_gpt=torch.device("cpu"),
         logger=logging.getLogger(__name__),
+        enable_cache=False,
     ):
         super().__init__()

@@ -36,6 +37,8 @@ def __init__(
         self.device = device
         self.device_gpt = device_gpt

+        self.enable_cache = enable_cache
+
         self.generator = torch.Generator(device=device)

         self.num_vq = int(gpt_config["num_vq"])
@@ -142,7 +145,6 @@ def prepare(self, compile=False):
     class _GenerationInputs:
         position_ids: torch.Tensor
         cache_position: torch.Tensor
-        use_cache: bool
         input_ids: Optional[torch.Tensor] = None
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
         attention_mask: Optional[torch.Tensor] = None
@@ -167,7 +169,6 @@ def _prepare_generation_inputs(
         inputs_embeds: Optional[torch.Tensor] = None,
         cache_position: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
-        use_cache=True,
     ) -> _GenerationInputs:
         # With static cache, the `past_key_values` is None
         # TODO joao: standardize interface for the different Cache classes and remove of this if
@@ -230,8 +231,7 @@ def _prepare_generation_inputs(
                 and attention_mask is not None
                 and cache_length + input_ids.shape[1] > max_cache_length
             ):
-                start_pos = attention_mask.shape[1] - max_cache_length
-                attention_mask = attention_mask.narrow(1, start_pos, max_cache_length)
+                attention_mask = attention_mask.narrow(1, -max_cache_length, max_cache_length)

         if attention_mask is not None and position_ids is None:
             # create position_ids on the fly for batch generation
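Note on the hunk above: the one-liner relies on torch.Tensor.narrow accepting a negative start, which counts back from the end of the dimension, so it keeps the same trailing max_cache_length columns that the removed two-line version selected. A small self-contained check (the shapes here are made up purely for illustration):

import torch

attention_mask = torch.ones(2, 10)   # hypothetical mask: batch of 2, sequence length 10
max_cache_length = 6                 # hypothetical cache budget

# Removed form: compute the starting column explicitly.
start_pos = attention_mask.shape[1] - max_cache_length
old = attention_mask.narrow(1, start_pos, max_cache_length)

# New form: a negative start indexes from the end of the dimension.
new = attention_mask.narrow(1, -max_cache_length, max_cache_length)

assert torch.equal(old, new)  # both keep the last max_cache_length columns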
@@ -258,7 +258,6 @@ def _prepare_generation_inputs(
         model_inputs = self._GenerationInputs(
             position_ids=position_ids,
             cache_position=cache_position,
-            use_cache=use_cache,
         )

         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
@@ -399,7 +398,6 @@ def generate(
                 inputs_ids,
                 past_key_values,
                 attention_mask_cache.narrow(1, 0, inputs_ids.shape[1]),
-                use_cache=not self.is_te_llama,
             )

             if i > 0:
@@ -423,7 +421,7 @@ def generate(
                 position_ids=model_input.position_ids,
                 past_key_values=model_input.past_key_values,
                 inputs_embeds=model_input.inputs_embeds,
-                use_cache=model_input.use_cache,
+                use_cache=not self.is_te_llama and self.enable_cache,
                 output_attentions=return_attn,
                 cache_position=model_input.cache_position,
             )
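Taken together, the commit stops threading use_cache through _GenerationInputs and instead derives it once per forward call from the new constructor flag. A minimal sketch of the resulting gating logic; the class below is a stand-in for illustration only, not the repository's API, though is_te_llama and enable_cache are the attribute names used in the diff:

class _CacheGateDemo:
    """Stand-in showing how use_cache is now resolved at the forward call."""

    def __init__(self, enable_cache=False, is_te_llama=False):
        self.enable_cache = enable_cache  # new flag added by this commit, off by default
        self.is_te_llama = is_te_llama    # the TE-Llama path already skipped the cache before

    def resolved_use_cache(self) -> bool:
        # Mirrors the expression now passed to the transformer forward call:
        #     use_cache = not self.is_te_llama and self.enable_cache
        return not self.is_te_llama and self.enable_cache


assert not _CacheGateDemo().resolved_use_cache()
assert _CacheGateDemo(enable_cache=True).resolved_use_cache()
assert not _CacheGateDemo(enable_cache=True, is_te_llama=True).resolved_use_cache()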