Skip to content

faster invalid utf8 escaping #128

@ruslandoga

Description

@ruslandoga

Consider replacing

ch/lib/ch/row_binary.ex

Lines 619 to 658 in 972f8d5

# Public entry point, hidden from generated docs.
# Scans `str` from offset 0 with an empty accumulator and flattens the
# resulting iodata into a single binary. Each run of bytes that does not
# decode as UTF-8 comes back as one "�"; valid stretches are copied through.
@doc false
def to_utf8(str) do
  str
  |> to_utf8(0, 0, str, [])
  |> IO.iodata_to_binary()
end
# The accumulator is deliberately built as an improper list ([acc | binary]),
# which is valid iodata; tell Dialyzer not to warn about it.
@dialyzer {:no_improper_lists, to_utf8: 5, to_utf8_escape: 5}

# Scans a stretch of valid UTF-8.
#
#   * first argument - the part of the input still to be examined
#   * offset / size  - byte position and byte length, within `source`, of the
#                      valid stretch seen so far
#   * source         - the complete original binary, used for slicing
#   * chunks         - iodata accumulated so far

# End of input: flush whatever valid stretch is pending.
defp to_utf8(<<>>, offset, size, source, chunks) do
  [chunks | binary_part(source, offset, size)]
end

# Another well-formed code point: widen the pending stretch by its byte size.
defp to_utf8(<<codepoint::utf8, tail::binary>>, offset, size, source, chunks) do
  to_utf8(tail, offset, size + utf8_size(codepoint), source, chunks)
end

# A byte that does not start a valid code point: flush the pending stretch and
# switch to escape mode, which starts at that byte with one invalid byte counted.
defp to_utf8(<<_bad_byte, tail::binary>>, offset, size, source, chunks) do
  valid_run = binary_part(source, offset, size)
  to_utf8_escape(tail, offset + size, 1, source, [chunks | valid_run])
end
# Scans a run of invalid bytes. `offset` is where the run starts in `source`
# and `skipped` is how many invalid bytes it contains so far. The whole run is
# replaced by a single "�", however long it is.

# End of input while still inside an invalid run: emit the replacement.
defp to_utf8_escape(<<>>, _offset, _skipped, _source, chunks) do
  [chunks | "�"]
end

# A well-formed code point ends the run: emit the replacement and resume
# valid-stretch scanning, starting at this code point.
defp to_utf8_escape(<<codepoint::utf8, tail::binary>>, offset, skipped, source, chunks) do
  to_utf8(tail, offset + skipped, utf8_size(codepoint), source, [chunks | "�"])
end

# Still invalid: extend the run by one byte.
defp to_utf8_escape(<<_bad_byte, tail::binary>>, offset, skipped, source, chunks) do
  to_utf8_escape(tail, offset, skipped + 1, source, chunks)
end
# Number of bytes UTF-8 uses to encode a code point (one to four).
# Clauses are tried top to bottom, so each guard only needs an upper bound.
# Inlined because it is called once per decoded code point by the scanners.
@compile inline: [utf8_size: 1]
# up to U+007F
defp utf8_size(cp) when cp <= 0x7F, do: 1
# up to U+07FF
defp utf8_size(cp) when cp <= 0x7FF, do: 2
# up to U+FFFF
defp utf8_size(cp) when cp <= 0xFFFF, do: 3
# up to U+10FFFF
defp utf8_size(cp) when cp <= 0x10FFFF, do: 4
with one of these approaches:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions