Moving UTF-8 related functions to Unicode module.

ppedrot · ppedrot · commit 236627cdf081 · 2016-06-14T00:09:30.000+02:00
diff --git a/lib/pp.ml b/lib/pp.ml
@@ -168,38 +168,6 @@ let utf8_length s =
   done ;
   !cnt
 
-(* Variant of String.sub for UTF8 character positions *)
-let utf8_sub s start_u len_u =
-  let len_b = String.length s
-  and end_u = start_u + len_u
-  and cnt = ref 0
-  and nc = ref 0
-  and p = ref 0 in
-  let start_b = ref len_b in
-  while !p < len_b && !cnt < end_u do
-    if !cnt <= start_u then start_b := !p ;
-    begin
-      match s.[!p] with
-      | '\000'..'\127' -> nc := 0 (* ascii char *)
-      | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
-      |	'\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
-      |	'\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
-      |	'\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
-      |	'\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
-      |	'\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
-      |	'\254'..'\255' -> nc := 0 (* invalid byte *)
-    end ;
-    incr p ;
-    while !p < len_b && !nc > 0 do
-      match s.[!p] with
-      |	'\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
-      |	_ (* not a continuation byte *) -> nc := 0
-    done ;
-    incr cnt
-  done ;
-  let end_b = !p in
-  String.sub s !start_b (end_b - !start_b)
-
 (* formatting commands *)
 let str s = Glue.atom(Ppcmd_print (Str_def s))
 let stras (i, s) = Glue.atom(Ppcmd_print (Str_len (s, i)))
diff --git a/lib/pp.mli b/lib/pp.mli
@@ -100,9 +100,6 @@ val close_tag : unit -> std_ppcmds
 
 val string_of_ppcmds : std_ppcmds -> string
 
-val utf8_length : string -> int
-val utf8_sub : string -> int -> int -> string
-
 (** {6 Printing combinators} *)
 
 val pr_comma : unit -> std_ppcmds
diff --git a/lib/unicode.ml b/lib/unicode.ml
@@ -261,3 +261,73 @@ let ascii_of_ident s =
         (Buffer.add_char out s.[!i]; incr i)
     done;
     Buffer.contents out
+
+(* Compute length of an UTF-8 encoded string
+   Rem 1 : utf8_length <= String.length (equal if pure ascii)
+   Rem 2 : if used for an iso8859_1 encoded string, the result is
+   wrong in very rare cases. Such a wrong case corresponds to any
+   sequence of a character in range 192..253 immediately followed by a
+   character in range 128..191 (typical case in french is "déçu" which
+   is counted 3 instead of 4); then no real harm to use always
+   utf8_length even if using an iso8859_1 encoding *)
+
+(** FIXME: duplicate code with Pp *)
+
+let utf8_length s =
+  let len = String.length s
+  and cnt = ref 0
+  and nc = ref 0
+  and p = ref 0 in
+  while !p < len do
+    begin
+      match s.[!p] with
+      | '\000'..'\127' -> nc := 0 (* ascii char *)
+      | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
+      | '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
+      | '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
+      | '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
+      | '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
+      | '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
+      | '\254'..'\255' -> nc := 0 (* invalid byte *)
+    end ;
+    incr p ;
+    while !p < len && !nc > 0 do
+      match s.[!p] with
+      | '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
+      | _ (* not a continuation byte *) -> nc := 0
+    done ;
+    incr cnt
+  done ;
+  !cnt
+
+(* Variant of String.sub for UTF8 character positions *)
+let utf8_sub s start_u len_u =
+  let len_b = String.length s
+  and end_u = start_u + len_u
+  and cnt = ref 0
+  and nc = ref 0
+  and p = ref 0 in
+  let start_b = ref len_b in
+  while !p < len_b && !cnt < end_u do
+    if !cnt <= start_u then start_b := !p ;
+    begin
+      match s.[!p] with
+      | '\000'..'\127' -> nc := 0 (* ascii char *)
+      | '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
+      |	'\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
+      |	'\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
+      |	'\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
+      |	'\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
+      |	'\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
+      |	'\254'..'\255' -> nc := 0 (* invalid byte *)
+    end ;
+    incr p ;
+    while !p < len_b && !nc > 0 do
+      match s.[!p] with
+      |	'\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
+      |	_ (* not a continuation byte *) -> nc := 0
+    done ;
+    incr cnt
+  done ;
+  let end_b = !p in
+  String.sub s !start_b (end_b - !start_b)
diff --git a/lib/unicode.mli b/lib/unicode.mli
@@ -40,3 +40,9 @@ val ascii_of_ident : string -> string
 
 (** Validate an UTF-8 string *)
 val is_utf8 : string -> bool
+
+(** Return the length of a valid UTF-8 string. *)
+val utf8_length : string -> int
+
+(** Variant of {!String.sub} for UTF-8 strings. *)
+val utf8_sub : string -> int -> int -> string
diff --git a/ltacprof/profile_ltac.ml b/ltacprof/profile_ltac.ml
@@ -1,3 +1,4 @@
+open Unicode
 open Pp
 open Printer
 open Util

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+open Unicode`
`1`	`2`	`open Pp`
`2`	`3`	`open Printer`
`3`	`4`	`open Util`