ModelTC · hiworldwzj · Jun 20, 2025 · Jun 12, 2025 · Jun 17, 2025 · Jun 17, 2025
diff --git a/lightllm/models/gemma3/layer_infer/pre_layer_infer.py b/lightllm/models/gemma3/layer_infer/pre_layer_infer.py
@@ -42,7 +42,7 @@ def context_forward(self, input_ids, infer_state, layer_weight):
                     continue
                 # pull the img_embeds by uid from shm
                 data = read_shm(get_shm_name_embed(img["uuid"]))
-                img_weight.append(bytes2tensor(data).cuda().reshape(img["token_num"], -1))
+                img_weight.append(bytes2tensor(data, torch_dtype=dtype).cuda().reshape(img["token_num"], -1))
                 img_start_token_ids.append(img["token_id"])
                 img_token_lens.append(img["token_num"])
                 img_start_locs.append(img_start_loc)

diff --git a/lightllm/models/qwen_vl/layer_infer/pre_layer_infer.py b/lightllm/models/qwen_vl/layer_infer/pre_layer_infer.py
@@ -49,7 +49,7 @@ def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_wei
                     continue
                 # pull the img_embeds by uid from shm
                 data = read_shm(get_shm_name_embed(img["uuid"]))
-                img_weight.append(bytes2tensor(data).cuda().reshape(img["token_num"], -1))
+                img_weight.append(bytes2tensor(data, torch_dtype=dtype).cuda().reshape(img["token_num"], -1))
                 img_start_token_ids.append(img["token_id"])
                 img_token_lens.append(img["token_num"])
                 img_start_locs.append(img_start_loc)

diff --git a/lightllm/server/embed_cache/utils.py b/lightllm/server/embed_cache/utils.py
@@ -5,17 +5,19 @@
 
 
 def tensor2bytes(t: torch.Tensor):
-    # t = t.cpu().numpy().tobytes()
-    # return t
-    buf = BytesIO()
-    torch.save(t.detach().cpu(), buf)
-    buf.seek(0)
-    return buf.read()
-
-
-def bytes2tensor(b):
-    # return torch.from_numpy(np.frombuffer(b, dtype=np.float16)).cuda()
-    return torch.load(BytesIO(b))
+    """Serialize the raw bits of ``t``; dtype and shape are not recorded."""
+    # View as uint8 instead of casting: numpy has no bfloat16, and a value cast
+    # to an integer dtype would destroy fractional/negative embedding values.
+    return t.detach().cpu().contiguous().reshape(-1).view(torch.uint8).numpy().tobytes()
+
+
+def bytes2tensor(b, torch_dtype=torch.bfloat16):
+    """Rebuild a 1-D tensor from ``tensor2bytes`` output written as ``torch_dtype``."""
+    if not b:
+        return torch.empty(0, dtype=torch_dtype)  # torch.frombuffer rejects empty buffers
+    # Reinterpret the bits (no value cast); bytearray gives torch a writable,
+    # privately owned buffer instead of aliasing the immutable bytes object.
+    return torch.frombuffer(bytearray(b), dtype=torch_dtype)
 
 
 def create_shm(name, data):