I'm handling a string with special characters and I want to split it into Unicode characters, but apparently OCaml strings are encoded in utf-8 while OCaml chars are 1-byte values, and it appears that there is no built-in function that even attempts to decode the utf-8 characters.
Here is a correct way to “explode” a UTF8-encoded string (of type string
) to a sequence of code points (of type Uchar.t
), only using the standard library; specifically, String
and Uchar
. You may also convert the code points to integers, by using Uchar.to_int : Uchar.t -> int
.
Below we choose to deal with invalid input (input that cannot be the UTF8 encoding of something) by inserting the replacement character ("�"
, U+FFFD, see Uchar.rep
). Alternatively, we may want to signal an error (e.g. by throwing an exception); for that purpose we can check Uchar.utf_decode_is_valid d
.
(** [uchar_seq_of_utf8 s] lazily decodes [s], read as UTF-8, into the
    sequence of its code points.

    Bytes that do not form a valid UTF-8 sequence are not reported as an
    error: each invalid decode contributes one replacement character
    ([Uchar.rep], U+FFFD) to the result. Nothing is decoded until the
    sequence is forced. *)
let uchar_seq_of_utf8 (s : string) : Uchar.t Seq.t =
  let len = String.length s in
  (* One decoding step: from byte offset [pos], produce the next code point
     together with the offset at which the following decode starts. *)
  let step pos =
    if pos >= len then None
    else
      let decoded = String.get_utf_8_uchar s pos in
      (* The decode length is always strictly positive, even for an invalid
         decode, so [pos] always advances and the sequence terminates. For
         an invalid decode, [Uchar.utf_decode_uchar] yields U+FFFD. *)
      let next_pos = pos + Uchar.utf_decode_length decoded in
      Some (Uchar.utf_decode_uchar decoded, next_pos)
  in
  Seq.unfold step 0
Examples (assuming your terminal input is UTF8, which should be the case nowadays):
# let test s =
s
|> uchar_seq_of_utf8
|> Seq.map Uchar.to_int
|> Seq.map (Printf.sprintf "U+%02X")
|> List.of_seq
;;
val test : string -> string list = <fun>
# test "hello" ;;
- : string list = ["U+68"; "U+65"; "U+6C"; "U+6C"; "U+6F"]
# test "olá" ;;
- : string list = ["U+6F"; "U+6C"; "U+E1"]
# test "안녕" ;;
- : string list = ["U+C548"; "U+B155"]
# test "hallöchen" ;;
- : string list = ["U+68"; "U+61"; "U+6C"; "U+6C"; "U+F6"; "U+63"; "U+68"; "U+65"; "U+6E"]
# test "hall\xF6chen" ;; (* this input is encoded as Latin-1, thus invalid when read as UTF8 *)
- : string list = ["U+68"; "U+61"; "U+6C"; "U+6C"; "U+FFFD"; "U+63"; "U+68"; "U+65"; "U+6E"]