Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 46 additions & 17 deletions src/z_string_normalize.erl
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,43 @@

-export([ normalize/1 ]).

%% Name of the CSV file holding all the word mappings; the file is kept in
%% the priv folder.
-define(WORD_MAPPING_FILE, "normalize-words-mapping.csv").

%% Guard test for separator characters in a lowercased string: every code
%% point below 127 that is neither a digit nor a lowercase letter, plus a
%% fixed set of Unicode spaces, quotes, dashes and line breaks.
-define(is_sep(C),
    C < $0
    orelse (C > $9 andalso C < $a)
    orelse (C > $z andalso C < 127)
    orelse C =:= 160                        % non breaking space
    orelse (C >= 8211 andalso C =< 8212)    % ndash, mdash
    orelse (C >= 8216 andalso C =< 8217)    % left and right single quote
    orelse (C >= 8220 andalso C =< 8221)    % left and right double quote
    orelse C =:= 8230                       % ellipsis
    orelse (C >= 8232 andalso C =< 8233)    % line and paragraph separator
).

%% Guard test for invisible or space-like code points that act as a word
%% boundary and are replaced with a space.
%%
%% The former entry 8023 was dropped: that code point is U+1F57 (a Greek
%% letter), not a zero width space. The zero width no-break space (U+FEFF,
%% 65279) and the zero width space (U+200B, 8203) are both listed below.
-define(is_map_space(C),
    C =:= 65279         % byte order mark / zero width no-break space (U+FEFF)
    orelse C =:= 8232   % line separator (U+2028)
    orelse C =:= 8233   % paragraph separator (U+2029)
    orelse C =:= 8203   % zero width space (U+200B)
    orelse C =:= 8204   % zero width non-joiner (U+200C)
).

%% Guard test for invisible formatting code points that are dropped from
%% the text: they neither end a word nor produce any output.
-define(is_word_ignore_char(C),
    C =:= 173           % soft hyphen
    orelse C =:= 8205   % zero width joiner
    orelse C =:= 8288   % word joiner
).




%% @doc Transliterate a Unicode string to an ASCII string with lowercase characters.
%% Tries to transliterate some characters to a..z
Expand All @@ -35,26 +70,18 @@ normalize({trans, [{_, First} | _] = Tr}) ->
V = proplists:get_value(en, Tr, First),
normalize(V).

%% Separators in a lowercased string: every code point below 127 that is
%% neither a digit nor a lowercase letter, plus the code points listed below.
%% NOTE(review): 8023 is U+1F57 (a Greek letter); the zero width space is
%% 8203 (U+200B) -- confirm which code point was intended.
-define(is_sep(C),
C < $0
orelse (C > $9 andalso C < $a)
orelse (C > $z andalso C < 127)
orelse C =:= 8023 % non breaking zero width space
orelse C =:= 8212 % mdash
orelse C =:= 8211 % ndash
).

%% Normalize specific words using the custom mappings from the CSV file.
%% This allows language-specific transliterations that differ from the
%% standard romanization rules. The word-mapped text is then handed to
%% normalize/2, starting with an empty accumulator.
normalize_words(Text) when is_binary(Text) ->
    Mapped = erlang:iolist_to_binary(normalize_words_word(Text, <<>>, [])),
    normalize(Mapped, <<>>).

normalize_words_word(<<>>, W, Acc) ->
lists:reverse([map_word(W)|Acc]);
normalize_words_word(<<C/utf8, T/binary>>, W, Acc) when ?is_sep(C) ->
normalize_words_word(<<C/utf8, T/binary>>, W, Acc) when ?is_word_ignore_char(C) ->
normalize_words_word(T, W, Acc);
normalize_words_word(<<C/utf8, T/binary>>, W, Acc) when ?is_sep(C) orelse ?is_map_space(C) ->
normalize_words_sep(T, <<C/utf8>>, [map_word(W)|Acc]);
normalize_words_word(<<C/utf8, T/binary>>, W, Acc) ->
normalize_words_word(T, <<W/binary, C/utf8>>, Acc);
Expand All @@ -64,7 +91,9 @@ normalize_words_word(<<_Byte, T/binary>>, W, Acc) ->

normalize_words_sep(<<>>, W, Acc) ->
lists:reverse([W|Acc]);
normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) when not (?is_sep(C)) ->
normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) when ?is_word_ignore_char(C) ->
normalize_words_sep(T, W, Acc);
normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) when not (?is_sep(C) orelse ?is_map_space(C)) ->
normalize_words_word(T, <<C/utf8>>, [W|Acc]);
normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) ->
normalize_words_sep(T, <<W/binary, C/utf8>>, Acc);
Expand Down Expand Up @@ -289,11 +318,11 @@ normalize(<<C/utf8,T/binary>>, Acc) when C >= 32, C =< 126 ->
normalize(<<C, T/binary>>, Acc) when C =:= $\n; C =:= $\t ->
% Keep newlines and tabs
normalize(T, <<Acc/binary, " ">>);
normalize(<<C/utf8,T/binary>>, Acc) when C < 32 ->
% Replace control characters with spaces
normalize(<<C/utf8,T/binary>>, Acc) when ?is_map_space(C) ->
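% Replace space-like characters with spaces.
% NOTE(review): control characters (C < 32) are no longer matched by this
% guard -- confirm they are still handled as intended.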
normalize(T, <<Acc/binary, " ">>);
normalize(<<C/utf8,T/binary>>, Acc) when C =:= 8023 ->
% Zero width space
normalize(<<C/utf8,T/binary>>, Acc) when ?is_word_ignore_char(C) ->
% Drop zero width joiner, word joiner and soft hyphen
normalize(T, Acc);
normalize(<<C/utf8,T/binary>>, Acc) ->
% Try to remove any accents.
Expand Down
5 changes: 4 additions & 1 deletion test/z_string_test.erl
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,12 @@ truncatechars_test() ->
%% Truncating to two words keeps "foo bar" and adds the given suffix.
truncatewords_test() ->
    Truncated = z_string:truncatewords(<<"foo bar bla">>, 2, <<"x">>),
    ?assertEqual(<<"foo bar x">>, Truncated).

normalize_map_words_test() ->
%% A lone Cyrillic word normalizes to the expected ASCII spelling.
normalize_map_word_test() ->
    Normalized = z_string:normalize(<<"Одесса"/utf8>>),
    ?assertEqual(<<"odesa">>, Normalized).

%% The same Cyrillic word inside a sentence: it gets the same ASCII spelling
%% and the surrounding words are lowercased.
normalize_map_words_test() ->
    Normalized = z_string:normalize(<<"the city Одесса is Ukrainian"/utf8>>),
    ?assertEqual(<<"the city odesa is ukrainian">>, Normalized).

normalize_test() ->
% binary()
?assertEqual(<<"a"/utf8>>, z_string:normalize(<<"ä"/utf8>>)),
Expand Down