I think Latin-1 is a subset of UTF-8, so there’s no way to differentiate UTF-8 from a bunch of Latin-1 characters.
Here is a nice UTF-8 validator I found on the Nine Nines site and converted to Elixir: https://ninenines.eu/articles/erlang-validate-utf8/
# Validates a binary as UTF-8 with a byte-at-a-time state machine.
#
# Call it with an initial state of 0. The return value is the final state:
#   0    - success: the input is complete, well-formed UTF-8
#   1    - error: an invalid byte was encountered
#   2..8 - incomplete data: the input ended in the middle of a multi-byte
#          sequence; the returned state can be passed back in together with
#          the next chunk to resume validation.
#
# Meaning of the intermediate states (what the NEXT byte must be):
#   2 - last continuation byte (128..191), then done
#   3 - a continuation byte (128..191), then one more
#   4 - after lead byte 224: 160..191 only (rejects overlong 3-byte forms)
#   5 - after lead byte 237: 128..159 only (rejects UTF-16 surrogates)
#   6 - after lead byte 240: 144..191 only (rejects overlong 4-byte forms)
#   7 - after lead bytes 241..243: any continuation byte, then two more
#   8 - after lead byte 244: 128..143 only (rejects code points > U+10FFFF)
#
# NOTE: the range guards use `c in lo..hi` on purpose. Writing
# `when c >= lo when c < hi` would be wrong in Elixir, because several `when`
# clauses on one definition are OR'ed together (unlike Erlang's `,`).
def validate_utf8(<<>>, state), do: state

# ASCII.
def validate_utf8(<<c, rest::bits>>, 0) when c < 128, do: validate_utf8(rest, 0)

# Continuation bytes 128..143.
def validate_utf8(<<c, rest::bits>>, 2) when c in 128..143, do: validate_utf8(rest, 0)
def validate_utf8(<<c, rest::bits>>, 3) when c in 128..143, do: validate_utf8(rest, 2)
def validate_utf8(<<c, rest::bits>>, 5) when c in 128..143, do: validate_utf8(rest, 2)
def validate_utf8(<<c, rest::bits>>, 7) when c in 128..143, do: validate_utf8(rest, 3)
def validate_utf8(<<c, rest::bits>>, 8) when c in 128..143, do: validate_utf8(rest, 3)

# Continuation bytes 144..159.
def validate_utf8(<<c, rest::bits>>, 2) when c in 144..159, do: validate_utf8(rest, 0)
def validate_utf8(<<c, rest::bits>>, 3) when c in 144..159, do: validate_utf8(rest, 2)
def validate_utf8(<<c, rest::bits>>, 5) when c in 144..159, do: validate_utf8(rest, 2)
def validate_utf8(<<c, rest::bits>>, 6) when c in 144..159, do: validate_utf8(rest, 3)
def validate_utf8(<<c, rest::bits>>, 7) when c in 144..159, do: validate_utf8(rest, 3)

# Continuation bytes 160..191.
def validate_utf8(<<c, rest::bits>>, 2) when c in 160..191, do: validate_utf8(rest, 0)
def validate_utf8(<<c, rest::bits>>, 3) when c in 160..191, do: validate_utf8(rest, 2)
def validate_utf8(<<c, rest::bits>>, 4) when c in 160..191, do: validate_utf8(rest, 2)
def validate_utf8(<<c, rest::bits>>, 6) when c in 160..191, do: validate_utf8(rest, 3)
def validate_utf8(<<c, rest::bits>>, 7) when c in 160..191, do: validate_utf8(rest, 3)

# Lead bytes (only legal in state 0). 192, 193 and 245..255 are never valid
# and fall through to the catch-all clause.
def validate_utf8(<<c, rest::bits>>, 0) when c in 194..223, do: validate_utf8(rest, 2)
def validate_utf8(<<224, rest::bits>>, 0), do: validate_utf8(rest, 4)
def validate_utf8(<<c, rest::bits>>, 0) when c in 225..236, do: validate_utf8(rest, 3)
def validate_utf8(<<237, rest::bits>>, 0), do: validate_utf8(rest, 5)
def validate_utf8(<<c, rest::bits>>, 0) when c in 238..239, do: validate_utf8(rest, 3)
def validate_utf8(<<240, rest::bits>>, 0), do: validate_utf8(rest, 6)
def validate_utf8(<<c, rest::bits>>, 0) when c in 241..243, do: validate_utf8(rest, 7)
def validate_utf8(<<244, rest::bits>>, 0), do: validate_utf8(rest, 8)

# Anything else (bad byte for the current state, or unknown state) is an error.
def validate_utf8(_, _), do: 1
So I think you can use that to determine whether your string is valid UTF-8 (`0`), is not valid UTF-8 (`1`), or is neither of those (`2…8`), which means the data ended in the middle of a multi-byte sequence.