faster invalid utf8 escaping

Consider replacing the current invalid-UTF-8 escaping implementation (https://github.com/plausible/ch/blob/972f8d5a936d3af2f1e01b23dcf195f1c529d411/lib/ch/row_binary.ex#L619-L658, reproduced below) with one of these approaches:

- [ ] https://github.com/Moosieus/UniRecover/pull/1
- [ ] https://github.com/elixir-unicode/unicode/blob/main/lib/unicode/validation/utf8.ex

	@doc false
	# Returns `str` as a binary in which every run of bytes that is not valid
	# UTF-8 has been replaced by a single "�"; valid parts are kept unchanged.
	def to_utf8(str) do
	  # The scan starts with an empty valid run at offset 0 and an empty
	  # accumulator; the resulting iodata is flattened once at the end.
	  str
	  |> to_utf8(0, 0, str, [])
	  |> IO.iodata_to_binary()
	end

	@dialyzer {:no_improper_lists, to_utf8: 5, to_utf8_escape: 5}

	# Scans the input for runs of valid UTF-8 and appends each run to `acc`.
	#
	#   * first argument - the part of `original` that has not been scanned yet
	#   * `from`, `len`  - byte offset and byte length of the current valid run
	#                      inside `original`
	#   * `original`     - the complete input binary, used for `binary_part/3`
	#   * `acc`          - the iodata built so far
	#
	# Returns iodata. It is built as an improper list (`[acc | binary]`), which
	# is why the Dialyzer `:no_improper_lists` warning is silenced for it.
	defp to_utf8(<<valid::utf8, rest::bytes>>, from, len, original, acc) do
	  # Valid code point: only widen the current run, nothing is copied yet.
	  to_utf8(rest, from, len + utf8_size(valid), original, acc)
	end

	defp to_utf8(<<_invalid, rest::bytes>>, from, len, original, acc) do
	  # First invalid byte: flush the valid run collected so far, then hand over
	  # to the escaping scan. The invalid run starts at `from + len` and is one
	  # byte long at this point.
	  acc = [acc | binary_part(original, from, len)]
	  to_utf8_escape(rest, from + len, 1, original, acc)
	end

	defp to_utf8(<<>>, from, len, original, acc) do
	  # End of input: flush the trailing valid run (it may be empty).
	  [acc | binary_part(original, from, len)]
	end

	# Skips over a run of invalid bytes and emits a single "�" for the whole
	# run, no matter how many bytes it contains.
	#
	#   * first argument - the part of `original` that has not been scanned yet
	#   * `from`, `len`  - byte offset and byte length of the current invalid
	#                      run inside `original`
	#   * `acc`          - the iodata built so far
	#
	# Returns iodata built as an improper list (`[acc | binary]`).
	defp to_utf8_escape(<<valid::utf8, rest::bytes>>, from, len, original, acc) do
	  # A valid code point ends the invalid run: emit the replacement once and
	  # resume the normal scan with a new valid run that starts right after the
	  # invalid bytes (`from + len`) and already contains `valid`.
	  acc = [acc | "�"]
	  to_utf8(rest, from + len, utf8_size(valid), original, acc)
	end

	defp to_utf8_escape(<<_invalid, rest::bytes>>, from, len, original, acc) do
	  # Still inside the invalid run: extend it by one byte, emit nothing.
	  to_utf8_escape(rest, from, len + 1, original, acc)
	end

	defp to_utf8_escape(<<>>, _from, _len, _original, acc) do
	  # Input ended inside an invalid run: emit its replacement and stop.
	  [acc | "�"]
	end

	# UTF-8 encodes code points in one to four bytes
	@compile inline: [utf8_size: 1]
	# Returns how many bytes the UTF-8 encoding of code point `cp` occupies.
	# Clause order matters: each guard only checks the upper bound of its range.
	# up to U+007F
	defp utf8_size(cp) when cp <= 0x7F, do: 1
	# up to U+07FF
	defp utf8_size(cp) when cp <= 0x7FF, do: 2
	# up to U+FFFF
	defp utf8_size(cp) when cp <= 0xFFFF, do: 3
	# up to U+10FFFF
	defp utf8_size(cp) when cp <= 0x10FFFF, do: 4

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

faster invalid utf8 escaping #128

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

faster invalid utf8 escaping #128

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions