Skip to content

Commit e5a8bb6

Browse files
mzegla and dtrawins
authored
Synchronize entire embeddings calculation phase (#1967)
Due to extensive usage of class member fields — which effectively hold state between calls, or even between phases of processing within a single call — the current synchronization in the embeddings calculation logic is insufficient, leading to inaccurate results when `add_request` is called in parallel from multiple threads. --------- Co-authored-by: Dariusz Trawinski <[email protected]>
1 parent 604b4c9 commit e5a8bb6

File tree

9 files changed

+40
-9
lines changed

9 files changed

+40
-9
lines changed

src/cpp/src/block_manager.hpp

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -550,6 +550,7 @@ class BlockManager {
550550
* @return A vector of blocks (one for each layer) occupied by this sequence for this layer.
551551
*/
552552
const std::vector<KVCacheBlock::Ptr>& get_block_table(uint64_t seq_id, size_t layer_idx) {
553+
std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
553554
OPENVINO_ASSERT(m_block_table.count(seq_id) == 1);
554555
return m_block_table[seq_id][layer_idx];
555556
}
@@ -570,6 +571,7 @@ class BlockManager {
570571
* @return Number of blocks freed in each sequence in the group.
571572
*/
572573
const size_t free_group_partially(SequenceGroup::Ptr sequence_group, size_t num_required_blocks) {
574+
std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
573575
size_t blocks_num = std::ceil(num_required_blocks / sequence_group->get_not_finished_sequences().size());
574576
auto not_finished_sequences = sequence_group->get_not_finished_sequences();
575577
for (size_t idx = 0; idx < not_finished_sequences.size(); ++idx) {
@@ -613,6 +615,7 @@ class BlockManager {
613615
}
614616

615617
const size_t free_partially_beam_search_group(SequenceGroup::Ptr sequence_group, size_t num_required_blocks) {
618+
std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
616619
size_t physical_blocks_released = 0;
617620
size_t logical_blocks_released = 0;
618621
while (num_required_blocks > physical_blocks_released) {
@@ -632,6 +635,7 @@ class BlockManager {
632635
* @return The number of distinct physical blocks occupied by this sequence group.
633636
*/
634637
const size_t get_number_of_blocks_occupied_by_sequence(SequenceGroup::Ptr sequence_group) {
638+
std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
635639
auto running_sequences = sequence_group->get_not_finished_sequences();
636640
std::set<size_t> indices;
637641
for (size_t idx = 0; idx < running_sequences.size(); ++idx) {
@@ -652,6 +656,7 @@ class BlockManager {
652656
* @return Whether or not this BlockManager is managing this sequence group.
653657
*/
654658
const bool has_block_table(uint64_t seq_id) {
659+
std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
655660
return m_block_table.count(seq_id) > 0;
656661
}
657662

@@ -766,6 +771,7 @@ class BlockManager {
766771
* other sequences tracked by this BlockManager.
767772
*/
768773
void fork_sequence(uint64_t parent_id, uint64_t child_id) {
774+
std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
769775
OPENVINO_ASSERT(m_block_table.count(child_id) == 0);
770776
m_block_table[child_id].resize(m_num_layers);
771777
for (size_t layer_idx = 0; layer_idx < m_num_layers; layer_idx++) {
@@ -782,6 +788,7 @@ class BlockManager {
782788
* @param seq_id Identifier of the sequence to free.
783789
*/
784790
void free_sequence(size_t seq_id) {
791+
std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
785792
OPENVINO_ASSERT(m_block_table.find(seq_id) != m_block_table.end(), "sequence with id ", seq_id,
786793
" not found in BlockManager, but requested to free");
787794
auto& block_table = m_block_table[seq_id];
@@ -846,6 +853,7 @@ class BlockManager {
846853
* @param logical_block_index_sets_to_free Sets (one for each layer) of logical block indices to be freed from this sequence.
847854
*/
848855
void free_blocks_from_sequence(size_t seq_id, const std::vector<std::set<size_t>>& logical_block_index_sets_to_free) {
856+
std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
849857
std::vector<std::vector<size_t>> logical_block_indices_to_free(logical_block_index_sets_to_free.size());
850858
for (size_t i = 0; i < logical_block_index_sets_to_free.size(); i++) {
851859
const auto& index_set = logical_block_index_sets_to_free[i];
@@ -916,6 +924,7 @@ class BlockManager {
916924
* allocated ones.
917925
*/
918926
size_t required_blocks_count(SequenceGroup::CPtr seq_group) {
927+
std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
919928
std::vector<Sequence::CPtr> running_sequences = seq_group->get_running_sequences();
920929
size_t blocks_count = 0; // total number of needed blocks for sequence group
921930
std::set<size_t> last_block_ids; // unique last block indices
@@ -973,6 +982,7 @@ class BlockManager {
973982
* @param seq_group Pointer to a sequence group.
974983
*/
975984
void free_empty_physical_blocks(SequenceGroup::Ptr seq_group) {
985+
std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
976986
size_t num_logical_blocks = seq_group->get_num_logical_blocks();
977987
if (num_logical_blocks == 0) {
978988
return;
@@ -997,6 +1007,7 @@ class BlockManager {
9971007
* indices into which the source block contents should be copied into separately.
9981008
*/
9991009
std::map<size_t, std::list<size_t>> append_slots(SequenceGroup::Ptr seq_group) {
1010+
std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
10001011
// Will always allocate the identical number of new blocks (if any) to each of the "layers" to keep the
10011012
// number of blocks occupied by each "layer" identical at all times.
10021013
size_t num_logical_blocks = seq_group->get_num_logical_blocks();

src/cpp/src/icontinuous_batching.cpp

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -214,8 +214,12 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re
214214
GenerationConfig sampling_params) {
215215
OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings.");
216216
ov::genai::VLMPerfMetrics metrics;
217-
m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template);
218-
ov::Tensor inputs = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, metrics);
217+
ov::Tensor inputs;
218+
{
219+
std::lock_guard<std::mutex> lock(m_embeddings_mutex);
220+
m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template);
221+
inputs = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, metrics);
222+
}
219223
return add_request(request_id, inputs, sampling_params);
220224
}
221225

src/cpp/src/icontinuous_batching.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
5757

5858
ModelInputType m_model_input_type = ModelInputType::TOKENS;
5959
std::shared_ptr<InputsEmbedder> m_inputs_embedder;
60+
std::mutex m_embeddings_mutex;
6061

6162
void stream_tokens(const std::shared_ptr<ThreadedStreamerWrapper>& streamer_ptr, const GenerationHandle& handle);
6263
public:

src/cpp/src/visual_language/internvl_chat/classes.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -256,7 +256,9 @@ ov::Tensor InputsEmbedderInternVLChat::get_inputs_embeds(const std::string& prom
256256
ov::Tensor text_embeds = m_embedding->infer(input_ids);
257257

258258
if (images.empty()) {
259-
return text_embeds;
259+
ov::Tensor inputs_embeds(text_embeds.get_element_type(), text_embeds.get_shape());
260+
std::memcpy(inputs_embeds.data(), text_embeds.data(), text_embeds.get_byte_size());
261+
return inputs_embeds;
260262
}
261263
auto start_tokenizer_time = std::chrono::steady_clock::now();
262264
ov::Tensor encoded_image_context_token = m_tokenizer.encode(image_context_token, ov::genai::add_special_tokens(false)).input_ids;

src/cpp/src/visual_language/llava/classes.cpp

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -135,7 +135,9 @@ ov::Tensor InputsEmbedderLLaVA::get_inputs_embeds(const std::string& prompt, con
135135
ov::Tensor text_embeds = m_embedding->infer(input_ids);
136136

137137
if (images.empty()) {
138-
return text_embeds;
138+
ov::Tensor inputs_embeds(text_embeds.get_element_type(), text_embeds.get_shape());
139+
std::memcpy(inputs_embeds.data(), text_embeds.data(), text_embeds.get_byte_size());
140+
return inputs_embeds;
139141
}
140142
auto start_tokenizer_time = std::chrono::steady_clock::now();
141143
ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids;
@@ -187,7 +189,11 @@ ov::Tensor InputsEmbedderLLaVA::merge_text_and_image_embeddings_llava(
187189
);
188190
token_offset -= n_tokens + 1;
189191
}
190-
return text_embeds;
192+
// text_embeds is bound to infer request that can be used by another thread after leaving embeddings calculation scope
193+
// so we need to return a copy to make sure data does not get corrupted
194+
ov::Tensor inputs_embeds(text_embeds.get_element_type(), text_embeds.get_shape());
195+
std::memcpy(inputs_embeds.data(), text_embeds.data(), text_embeds.get_byte_size());
196+
return inputs_embeds;
191197
}
192198

193199
} // namespace ov::genai

src/cpp/src/visual_language/llava_next/classes.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -373,7 +373,9 @@ ov::Tensor InputsEmbedderLLaVANext::get_inputs_embeds(const std::string& prompt,
373373
ov::Tensor text_embeds = m_embedding->infer(input_ids);
374374

375375
if (images.empty()) {
376-
return text_embeds;
376+
ov::Tensor inputs_embeds(text_embeds.get_element_type(), text_embeds.get_shape());
377+
std::memcpy(inputs_embeds.data(), text_embeds.data(), text_embeds.get_byte_size());
378+
return inputs_embeds;
377379
}
378380
auto start_tokenizer_time = std::chrono::steady_clock::now();
379381
ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids;

src/cpp/src/visual_language/minicpm/classes.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -679,7 +679,11 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, c
679679
m_image_id = 0;
680680
m_prev_image_id = 0;
681681
}
682-
return inputs_embeds;
682+
// inputs_embeds is bound to infer request that can be used by another thread after leaving this scope
683+
// so we need to return a copy to make sure data does not get corrupted
684+
ov::Tensor inputs_embeds_copy(inputs_embeds.get_element_type(), inputs_embeds.get_shape());
685+
std::memcpy(inputs_embeds_copy.data(), inputs_embeds.data(), inputs_embeds.get_byte_size());
686+
return inputs_embeds_copy;
683687
}
684688

685689
void InputsEmbedderMiniCPM::update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status) {

src/cpp/src/visual_language/phi3_vision/classes.cpp

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -623,7 +623,6 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& prompt, con
623623
if (!m_is_chat_conversation) {
624624
m_tokens_per_images.clear();
625625
}
626-
627626
return inputs_embeds;
628627
}
629628

src/cpp/src/visual_language/qwen2vl/classes.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -338,7 +338,9 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& prompt, c
338338
m_image_id = 0;
339339
}
340340
if (images.empty()) {
341-
return text_embeds;
341+
ov::Tensor inputs_embeds(text_embeds.get_element_type(), text_embeds.get_shape());
342+
std::memcpy(inputs_embeds.data(), text_embeds.data(), text_embeds.get_byte_size());
343+
return inputs_embeds;
342344
}
343345

344346
return merge_text_and_image_embeddings_qwen2vl(input_ids, text_embeds, reordered_image_embeds, reordered_images_grid_thw, image_pad_token_id);

0 commit comments

Comments
 (0)