From 410093a8f5125b1ee18c8fff705ff0cdf7244342 Mon Sep 17 00:00:00 2001
From: Marc Worrell
Date: Thu, 13 Nov 2025 13:12:46 +0100
Subject: [PATCH] Also handle nbsp as word separators

---
 src/z_string_normalize.erl | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

NOTE(review): line breaks and indentation of this patch were restored from a
  whitespace-collapsed copy, guided by the hunk header counts. The whitespace
  inside context lines is a best guess -- verify against the target file
  before applying.
NOTE(review): the second and third hunks add the lines
  "<<<<<<< Updated upstream", "=======" and ">>>>>>> Stashed changes" as file
  content. These look like leftover conflict markers (presumably from a stash
  pop) and are not valid Erlang; both variants of the normalize_words/1
  comment end up in the file as well.
NOTE(review): the first hunk adds a new -define(is_sep(C), ...) near the top
  of the module, while the context of the second hunk shows that an is_sep/1
  macro is already defined further down and is not removed by this patch. The
  Erlang preprocessor only allows overloading a macro with a different number
  of arguments, so the second definition is expected to be rejected -- confirm
  and drop one of the two.
NOTE(review): 8023 is U+1F57 (a Greek Extended letter), not a space.
  Possibly a digit transposition of 8203 (U+200B ZERO WIDTH SPACE); a zero
  width no-break space would be 65279 (U+FEFF). 160 (U+00A0), 8211 (U+2013)
  and 8212 (U+2014) match their comments. TODO confirm the intended code
  point.

diff --git a/src/z_string_normalize.erl b/src/z_string_normalize.erl
index 0170c73..5cd07aa 100644
--- a/src/z_string_normalize.erl
+++ b/src/z_string_normalize.erl
@@ -12,8 +12,20 @@
 -export([
     normalize/1
 ]).
 
+%% File with all the word mappings in the priv folder.
 -define(WORD_MAPPING_FILE, "normalize-words-mapping.csv").
 
+%% Separators in a lowercased string.
+-define(is_sep(C),
+        C < $0
+        orelse (C > $9 andalso C < $a)
+        orelse (C > $z andalso C < 127)
+        orelse C =:= 160 % non breaking space
+        orelse C =:= 8023 % non breaking zero width space
+        orelse C =:= 8212 % mdash
+        orelse C =:= 8211 % ndash
+    ).
+
 %% @doc Transliterate an unicode string to an ascii string with lowercase characters.
 %% Tries to transliterate some characters to a..z
@@ -35,6 +47,7 @@ normalize({trans, [{_, First} | _] = Tr}) ->
     V = proplists:get_value(en, Tr, First),
     normalize(V).
 
+<<<<<<< Updated upstream
 %% Separators in a lowercased string
 -define(is_sep(C),
         C < $0
@@ -48,6 +61,10 @@ normalize({trans, [{_, First} | _] = Tr}) ->
 %% Normalize specific words using custom mappings from CSV file.
 %% This allows language-specific transliterations that differ from
 %% standard romanization rules.
+=======
+%% Normalize some common (Ukrainian) strings, that would be different when
+%% using the Russian romanization rules.
+>>>>>>> Stashed changes
 normalize_words(B) when is_binary(B) ->
     Ws = normalize_words_word(B, <<>>, []),
     normalize(erlang:iolist_to_binary(Ws), <<>>).