[SOLVED] How can I quickly split an utf8 string into chars in OCaml?

How can I quickly split an utf8 string into chars in OCaml?

I'm handling a string with special characters and I want to split it into Unicode characters, but apparently OCaml strings are encoded in utf-8 while OCaml chars are 1-byte values, and it appears that there is no built-in function that even attempts to decode the utf-8 characters.

Solution

Here is a correct way to “explode” a UTF-8-encoded string (of type string) into a sequence of code points (of type Uchar.t), using only the standard library (OCaml 4.14 or later); specifically, the String, Uchar and Seq modules. You may also convert the code points to integers by using Uchar.to_int : Uchar.t -> int.

Below we choose to deal with invalid input (input that cannot be the UTF-8 encoding of anything) by inserting the replacement character ("�", U+FFFD, see Uchar.rep). Alternatively, we may want to signal an error (e.g. by raising an exception); for that purpose we can check Uchar.utf_decode_is_valid d.

(** [uchar_seq_of_utf8 s] lazily decodes [s], read as UTF-8, into the
    sequence of its code points.

    Decoding never fails: wherever the bytes are not valid UTF-8, the
    sequence yields the replacement character U+FFFD ([Uchar.rep]) and
    resumes right after the bytes the decoder reports as consumed. *)
let uchar_seq_of_utf8 (s : string) : Uchar.t Seq.t =
  let len = String.length s in
  (* One decoding step: [None] at end of input, otherwise the code point
     found at byte offset [pos] paired with the offset of the next one. *)
  let step pos =
    if pos >= len then None
    else
      let decoded = String.get_utf_8_uchar s pos in
      let next = pos + Uchar.utf_decode_length decoded in
      Some (Uchar.utf_decode_uchar decoded, next)
  in
  Seq.unfold step 0

Examples (assuming your terminal input is UTF8, which should be the case nowadays):

# let test s =
  (* Decode [s] with [uchar_seq_of_utf8] and render each code point as
     "U+" followed by its value in uppercase hexadecimal (at least two
     digits), collecting the results in a list. *)
  s
  |> uchar_seq_of_utf8
  |> Seq.map Uchar.to_int
  |> Seq.map (Printf.sprintf "U+%02X")
  |> List.of_seq
;;
val test : string -> string list = <fun>
# test "hello" ;;
- : string list = ["U+68"; "U+65"; "U+6C"; "U+6C"; "U+6F"]
# test "olá" ;;
- : string list = ["U+6F"; "U+6C"; "U+E1"]
# test "안녕" ;;
- : string list = ["U+C548"; "U+B155"]
# test "hallöchen" ;;
- : string list = ["U+68"; "U+61"; "U+6C"; "U+6C"; "U+F6"; "U+63"; "U+68"; "U+65"; "U+6E"]
# test "hall\xF6chen" ;; (* this input is encoded as Latin-1, thus invalid when read as UTF8 *)
- : string list = ["U+68"; "U+61"; "U+6C"; "U+6C"; "U+FFFD"; "U+63"; "U+68"; "U+65"; "U+6E"]