Skip to content

Commit d6b00f4

Browse files
[Security] Fix: Bad use of null-like value (#1634)
Signed-off-by: Artur Fierka <[email protected]>
1 parent 66858d6 commit d6b00f4

File tree

3 files changed

+15
-17
lines changed

3 files changed

+15
-17
lines changed

vllm/forward_context.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ def set_forward_context(attn_metadata: Any,
156156
dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
157157
cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)
158158

159+
assert current_platform is not None, "current_platform is None" # noqa
159160
if current_platform.is_hpu(): # noqa
160161
num_experts_per_tok = 0
161162
num_experts_per_tok = getattr(

vllm/model_executor/models/mllama.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,22 +1064,19 @@ def _attention_with_mask_hpu(
10641064
# Skip writing kv-cache for the initial profiling run.
10651065
if kv_cache is not None and isinstance(kv_cache, tuple):
10661066
assert self.attn.backend == _Backend.HPU_ATTN
1067-
# During cross-attention decode, key & value will be None,
1068-
# we don't need to cache them.
1069-
if (k is not None) and (v is not None):
1070-
from vllm.attention.ops.hpu_paged_attn import HPUPagedAttention
1071-
key_cache, value_cache = HPUPagedAttention.split_kv_cache(
1072-
kv_cache, self.num_local_key_value_heads, self.head_dim)
1073-
cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode])
1074-
cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode])
1075-
slot_mapping = torch.cat([
1076-
attn_metadata.cross_slot_mapping[s:e]
1077-
for s, e in kv_range_for_decode
1078-
])
1079-
key_cache = self.attn.impl.k_cache(cached_k, key_cache,
1080-
slot_mapping)
1081-
value_cache = self.attn.impl.v_cache(cached_v, value_cache,
1082-
slot_mapping)
1067+
from vllm.attention.ops.hpu_paged_attn import HPUPagedAttention
1068+
key_cache, value_cache = HPUPagedAttention.split_kv_cache(
1069+
kv_cache, self.num_local_key_value_heads, self.head_dim)
1070+
cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode])
1071+
cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode])
1072+
slot_mapping = torch.cat([
1073+
attn_metadata.cross_slot_mapping[s:e]
1074+
for s, e in kv_range_for_decode
1075+
])
1076+
key_cache = self.attn.impl.k_cache(cached_k, key_cache,
1077+
slot_mapping)
1078+
value_cache = self.attn.impl.v_cache(cached_v, value_cache,
1079+
slot_mapping)
10831080

10841081
q_len = q.shape[0]
10851082
kv_len = k.shape[0]

vllm/worker/hpu_model_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2467,7 +2467,7 @@ def create_dummy_seq_group_metadata(self,
24672467
sampling_params = None
24682468
else:
24692469
sampling_params = SamplingParams(temperature=temperature)
2470-
num_blocks = math.ceil(seq_len / self.block_size)
2470+
num_blocks = math.ceil(seq_len / self.block_size)
24712471
seq_len = max(seq_len, 1)
24722472
computed_block_nums = None
24732473
if is_prompt and self.model_is_mrope and num_patches:

0 commit comments

Comments (0)