string utf-8 ocaml

How can I quickly split an utf8 string into chars in OCaml?


I'm handling a string with special characters and I want to split it into Unicode characters, but apparently OCaml strings are encoded in utf-8 while OCaml chars are 1-byte values, and it appears that there is no built-in function that even attempts to decode the utf-8 characters.


Solution

  • Here is a correct way to “explode” a UTF8-encoded string (of type string) to a sequence of code points (of type Uchar.t), only using the standard library; specifically, String and Uchar. You may also convert the code points to integers, by using Uchar.to_int : Uchar.t -> int.

    Below we choose to deal with invalid input (input that cannot be the UTF8 encoding of something) by inserting the replacement character ("�", U+FFFD, see Uchar.rep). Alternatively, we may want to signal an error (e.g. by throwing an exception); for that purpose we can check Uchar.utf_decode_is_valid d.

    (** [uchar_seq_of_utf8 s] lazily decodes the UTF-8 bytes of [s] into the
        sequence of its code points, from the first byte to the last.
        Bytes that do not form valid UTF-8 are not rejected: each malformed
        run shows up as one replacement character (U+FFFD, [Uchar.rep]). *)
    let uchar_seq_of_utf8 (s : string) : Uchar.t Seq.t =
      let len = String.length s in
      (* One decoding step; [pos] is the byte offset of the next code point. *)
      let step pos =
        if pos >= len then None
        else
          let dec = String.get_utf_8_uchar s pos in
          (* A malformed decode still reports a strictly positive length
             (not necessarily 1) and carries U+FFFD as its character, so
             the offset always moves forward. *)
          Some (Uchar.utf_decode_uchar dec, pos + Uchar.utf_decode_length dec)
      in
      Seq.unfold step 0
    

    Examples (assuming your terminal input is UTF8, which should be the case nowadays):

    # let test s =
      s
      |> uchar_seq_of_utf8
      |> Seq.map Uchar.to_int
      |> Seq.map (Printf.sprintf "U+%02X")
      |> List.of_seq
    ;;
    val test : string -> string list = <fun>
    # test "hello" ;;
    - : string list = ["U+68"; "U+65"; "U+6C"; "U+6C"; "U+6F"]
    # test "olá" ;;
    - : string list = ["U+6F"; "U+6C"; "U+E1"]
    # test "안녕" ;;
    - : string list = ["U+C548"; "U+B155"]
    # test "hallöchen" ;;
    - : string list = ["U+68"; "U+61"; "U+6C"; "U+6C"; "U+F6"; "U+63"; "U+68"; "U+65"; "U+6E"]
    # test "hall\xF6chen" ;; (* this input is encoded as Latin-1, thus invalid when read as UTF8 *)
    - : string list = ["U+68"; "U+61"; "U+6C"; "U+6C"; "U+FFFD"; "U+63"; "U+68"; "U+65"; "U+6E"]