|
| 1 | +// Copyright (C) 2018-2025 Intel Corporation |
| 2 | +// SPDX-License-Identifier: Apache-2.0 |
| 3 | +// |
| 4 | + |
| 5 | +#include "tokenizers_factory.hpp" |
| 6 | + |
| 7 | +#include "openvino/core/except.hpp" |
| 8 | +#include "tokenizer.hpp" |
| 9 | + |
| 10 | +namespace ov { |
| 11 | +namespace tokenizers { |
| 12 | + |
| 13 | +namespace { |
| 14 | +template <typename T> |
| 15 | +T get_attribute_value(const ov::AnyMap& attributes, const std::string& attribute_name, const T& default_value) { |
| 16 | + return attributes.count(attribute_name) && attributes.at(attribute_name).is<T>() |
| 17 | + ? attributes.at(attribute_name).as<T>() |
| 18 | + : default_value; |
| 19 | +} |
| 20 | + |
| 21 | +} // namespace |
| 22 | + |
| 23 | +ov::OutputVector create_tokenizer_node(const std::string& op_type, |
| 24 | + const ov::OutputVector& inputs, |
| 25 | + const ov::AnyMap& attributes) { |
| 26 | + if (op_type == "StringTensorUnpack") { |
| 27 | + return std::make_shared<StringTensorUnpack>(inputs)->outputs(); |
| 28 | + } else if (op_type == "SpecialTokensSplit") { |
| 29 | + return std::make_shared<SpecialTokensSplit>(inputs)->outputs(); |
| 30 | + } else if (op_type == "RegexSplit") { |
| 31 | + auto behaviour = get_attribute_value<std::string>(attributes, "behaviour", "remove"); |
| 32 | + auto invert = get_attribute_value<bool>(attributes, "invert", false); |
| 33 | + return std::make_shared<RegexSplit>(inputs, behaviour, invert)->outputs(); |
| 34 | + } else if (op_type == "RaggedToDense") { |
| 35 | + auto pad_right = get_attribute_value<bool>(attributes, "pad_right", true); |
| 36 | + auto pad_max_length = get_attribute_value<bool>(attributes, "pad_max_length", false); |
| 37 | + return std::make_shared<RaggedToDense>(inputs, pad_right, pad_max_length)->outputs(); |
| 38 | + } else if (op_type == "VocabDecoder") { |
| 39 | + return std::make_shared<VocabDecoder>(inputs, std::vector<int32_t>{})->outputs(); |
| 40 | + } else if (op_type == "FuzeRagged") { |
| 41 | + return std::make_shared<FuzeRagged>(inputs)->outputs(); |
| 42 | + } else if (op_type == "StringTensorPack") { |
| 43 | + return std::make_shared<StringTensorPack>(inputs)->outputs(); |
| 44 | + } else if (op_type == "BPETokenizer") { |
| 45 | + auto unk_token = get_attribute_value<std::string>(attributes, "unk_token", ""); |
| 46 | + auto fuse_unk = get_attribute_value<bool>(attributes, "fuse_unk", false); |
| 47 | + auto suffix_indicator = get_attribute_value<std::string>(attributes, "suffix_indicator", ""); |
| 48 | + auto end_suffix = get_attribute_value<std::string>(attributes, "end_suffix", ""); |
| 49 | + auto byte_fallback = get_attribute_value<bool>(attributes, "byte_fallback", false); |
| 50 | + return std::make_shared<BPETokenizer>(inputs, unk_token, fuse_unk, suffix_indicator, end_suffix, byte_fallback) |
| 51 | + ->outputs(); |
| 52 | + } |
| 53 | + OPENVINO_THROW("Unsupported operation type: `", op_type, "`"); |
| 54 | +} |
| 55 | + |
| 56 | +} // namespace tokenizers |
| 57 | +} // namespace ov |
0 commit comments