Skip to content

Commit 85be884

Browse files
authored
[GGUF] Create tokenizers factory for GGUF support in OpenVINO GenAI (#494)
* [GGUF] Create tokenizers factory method Needed for GGUF tokenizers support Signed-off-by: Kazantsev, Roman <[email protected]> * Clean-up implementation Signed-off-by: Kazantsev, Roman <[email protected]> * Update src/tokenizers_factory.cpp * Update src/tokenizers_factory.cpp --------- Signed-off-by: Kazantsev, Roman <[email protected]>
1 parent 130827a commit 85be884

File tree

2 files changed

+93
-0
lines changed

2 files changed

+93
-0
lines changed

src/tokenizers_factory.cpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// Copyright (C) 2018-2025 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "tokenizers_factory.hpp"
6+
7+
#include "openvino/core/except.hpp"
8+
#include "tokenizer.hpp"
9+
10+
namespace ov {
11+
namespace tokenizers {
12+
13+
namespace {
14+
template <typename T>
15+
T get_attribute_value(const ov::AnyMap& attributes, const std::string& attribute_name, const T& default_value) {
16+
return attributes.count(attribute_name) && attributes.at(attribute_name).is<T>()
17+
? attributes.at(attribute_name).as<T>()
18+
: default_value;
19+
}
20+
21+
} // namespace
22+
23+
/**
 * @brief Builds the tokenizer operation named by @p op_type and returns its outputs.
 *
 * Recognised type names: "StringTensorUnpack", "SpecialTokensSplit", "RegexSplit", "RaggedToDense", "VocabDecoder",
 * "FuzeRagged", "StringTensorPack" and "BPETokenizer". Optional settings are read from @p attributes through
 * get_attribute_value(); the default used for each setting is the last argument of its lookup below.
 *
 * @param op_type Name of the operation to create.
 * @param inputs Inputs handed to the operation's constructor.
 * @param attributes Optional operation-specific settings.
 * @return Outputs of the newly created node.
 * @throws Raises through OPENVINO_THROW when @p op_type matches none of the recognised names.
 */
ov::OutputVector create_tokenizer_node(const std::string& op_type,
                                       const ov::OutputVector& inputs,
                                       const ov::AnyMap& attributes) {
    if (op_type == "StringTensorUnpack") {
        const auto node = std::make_shared<StringTensorUnpack>(inputs);
        return node->outputs();
    }

    if (op_type == "SpecialTokensSplit") {
        const auto node = std::make_shared<SpecialTokensSplit>(inputs);
        return node->outputs();
    }

    if (op_type == "RegexSplit") {
        const std::string split_behaviour = get_attribute_value<std::string>(attributes, "behaviour", "remove");
        const bool should_invert = get_attribute_value<bool>(attributes, "invert", false);
        const auto node = std::make_shared<RegexSplit>(inputs, split_behaviour, should_invert);
        return node->outputs();
    }

    if (op_type == "RaggedToDense") {
        const bool right_padding = get_attribute_value<bool>(attributes, "pad_right", true);
        const bool max_length_padding = get_attribute_value<bool>(attributes, "pad_max_length", false);
        const auto node = std::make_shared<RaggedToDense>(inputs, right_padding, max_length_padding);
        return node->outputs();
    }

    if (op_type == "VocabDecoder") {
        // The second constructor argument is always an empty vector; no attribute is consulted for it.
        const auto node = std::make_shared<VocabDecoder>(inputs, std::vector<int32_t>{});
        return node->outputs();
    }

    if (op_type == "FuzeRagged") {
        const auto node = std::make_shared<FuzeRagged>(inputs);
        return node->outputs();
    }

    if (op_type == "StringTensorPack") {
        const auto node = std::make_shared<StringTensorPack>(inputs);
        return node->outputs();
    }

    if (op_type == "BPETokenizer") {
        const std::string unknown_token = get_attribute_value<std::string>(attributes, "unk_token", "");
        const bool fuse_unknown = get_attribute_value<bool>(attributes, "fuse_unk", false);
        const std::string suffix_marker = get_attribute_value<std::string>(attributes, "suffix_indicator", "");
        const std::string word_end_suffix = get_attribute_value<std::string>(attributes, "end_suffix", "");
        const bool use_byte_fallback = get_attribute_value<bool>(attributes, "byte_fallback", false);
        const auto node = std::make_shared<BPETokenizer>(inputs,
                                                         unknown_token,
                                                         fuse_unknown,
                                                         suffix_marker,
                                                         word_end_suffix,
                                                         use_byte_fallback);
        return node->outputs();
    }

    // No branch matched: the requested operation type is not handled by this factory.
    OPENVINO_THROW("Unsupported operation type: `", op_type, "`");
}
55+
56+
} // namespace tokenizers
57+
} // namespace ov

src/tokenizers_factory.hpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright (C) 2018-2025 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#pragma once
6+
7+
#include "openvino/core/core.hpp"
8+
9+
namespace ov {
10+
namespace tokenizers {
11+
12+
/**
13+
* @brief Creates an OpenVINO operation node of the specified type with the given inputs and attributes.
14+
*
15+
* This function constructs a node in the OpenVINO computational graph based on the provided operation type,
16+
* input tensors, and a map of operation-specific attributes. It returns the output(s) produced by the node.
17+
*
18+
* @note This function is used exclusively by OpenVINO GenAI to create tokenizer and detokenizer operations
19+
* from a GGUF file. It is expected to be an external symbol resolved by name at runtime (e.g. `dlopen` + `dlsym`).
20+
*
21+
* @warning The signature of this function must not be changed. It is dynamically loaded at runtime,
22+
* and any modifications will break compatibility with OpenVINO GenAI.
23+
*
24+
* @param op_type A string specifying the type of operation to create (e.g., "BPETokenizer").
25+
* @param inputs A vector of OpenVINO outputs (`ov::OutputVector`) representing the input tensors to the operation.
26+
* @param attributes A map (`ov::AnyMap`) containing operation-specific attributes.
27+
*
28+
* @return ov::OutputVector A vector containing the output(s) of the created node. The number of outputs depends on the
29+
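* operation type.
*
* @throws ov::Exception (raised through `OPENVINO_THROW`) if `op_type` is not a supported operation type.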
30+
*/
31+
32+
OPENVINO_API_C(ov::OutputVector)
33+
create_tokenizer_node(const std::string& op_type, const ov::OutputVector& inputs, const ov::AnyMap& attributes);
34+
35+
} // namespace tokenizers
36+
} // namespace ov

0 commit comments

Comments
 (0)