Skip to content

Commit 236627c

Browse files
committed
Moving UTF-8 related functions to Unicode module.
1 parent 45de05d commit 236627c

File tree

5 files changed

+77
-35
lines changed

5 files changed

+77
-35
lines changed

lib/pp.ml

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -168,38 +168,6 @@ let utf8_length s =
168168
done ;
169169
!cnt
170170

171-
(* Variant of String.sub for UTF8 character positions *)
172-
let utf8_sub s start_u len_u =
173-
let len_b = String.length s
174-
and end_u = start_u + len_u
175-
and cnt = ref 0
176-
and nc = ref 0
177-
and p = ref 0 in
178-
let start_b = ref len_b in
179-
while !p < len_b && !cnt < end_u do
180-
if !cnt <= start_u then start_b := !p ;
181-
begin
182-
match s.[!p] with
183-
| '\000'..'\127' -> nc := 0 (* ascii char *)
184-
| '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
185-
| '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
186-
| '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
187-
| '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
188-
| '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
189-
| '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
190-
| '\254'..'\255' -> nc := 0 (* invalid byte *)
191-
end ;
192-
incr p ;
193-
while !p < len_b && !nc > 0 do
194-
match s.[!p] with
195-
| '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
196-
| _ (* not a continuation byte *) -> nc := 0
197-
done ;
198-
incr cnt
199-
done ;
200-
let end_b = !p in
201-
String.sub s !start_b (end_b - !start_b)
202-
203171
(* formatting commands *)
204172
let str s = Glue.atom(Ppcmd_print (Str_def s))
205173
let stras (i, s) = Glue.atom(Ppcmd_print (Str_len (s, i)))

lib/pp.mli

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,6 @@ val close_tag : unit -> std_ppcmds
100100

101101
val string_of_ppcmds : std_ppcmds -> string
102102

103-
val utf8_length : string -> int
104-
val utf8_sub : string -> int -> int -> string
105-
106103
(** {6 Printing combinators} *)
107104

108105
val pr_comma : unit -> std_ppcmds

lib/unicode.ml

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,3 +261,73 @@ let ascii_of_ident s =
261261
(Buffer.add_char out s.[!i]; incr i)
262262
done;
263263
Buffer.contents out
264+
265+
(* Compute length of an UTF-8 encoded string
266+
Rem 1 : utf8_length <= String.length (equal if pure ascii)
267+
Rem 2 : if used for an iso8859_1 encoded string, the result is
268+
wrong in very rare cases. Such a wrong case corresponds to any
269+
sequence of a character in range 192..253 immediately followed by a
270+
character in range 128..191 (typical case in french is "déçu" which
271+
is counted 3 instead of 4); then no real harm to use always
272+
utf8_length even if using an iso8859_1 encoding *)
273+
274+
(** FIXME: duplicate code with Pp *)
275+
276+
let utf8_length s =
277+
let len = String.length s
278+
and cnt = ref 0
279+
and nc = ref 0
280+
and p = ref 0 in
281+
while !p < len do
282+
begin
283+
match s.[!p] with
284+
| '\000'..'\127' -> nc := 0 (* ascii char *)
285+
| '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
286+
| '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
287+
| '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
288+
| '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
289+
| '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
290+
| '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
291+
| '\254'..'\255' -> nc := 0 (* invalid byte *)
292+
end ;
293+
incr p ;
294+
while !p < len && !nc > 0 do
295+
match s.[!p] with
296+
| '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
297+
| _ (* not a continuation byte *) -> nc := 0
298+
done ;
299+
incr cnt
300+
done ;
301+
!cnt
302+
303+
(* Variant of String.sub for UTF8 character positions *)
304+
let utf8_sub s start_u len_u =
305+
let len_b = String.length s
306+
and end_u = start_u + len_u
307+
and cnt = ref 0
308+
and nc = ref 0
309+
and p = ref 0 in
310+
let start_b = ref len_b in
311+
while !p < len_b && !cnt < end_u do
312+
if !cnt <= start_u then start_b := !p ;
313+
begin
314+
match s.[!p] with
315+
| '\000'..'\127' -> nc := 0 (* ascii char *)
316+
| '\128'..'\191' -> nc := 0 (* cannot start with a continuation byte *)
317+
| '\192'..'\223' -> nc := 1 (* expect 1 continuation byte *)
318+
| '\224'..'\239' -> nc := 2 (* expect 2 continuation bytes *)
319+
| '\240'..'\247' -> nc := 3 (* expect 3 continuation bytes *)
320+
| '\248'..'\251' -> nc := 4 (* expect 4 continuation bytes *)
321+
| '\252'..'\253' -> nc := 5 (* expect 5 continuation bytes *)
322+
| '\254'..'\255' -> nc := 0 (* invalid byte *)
323+
end ;
324+
incr p ;
325+
while !p < len_b && !nc > 0 do
326+
match s.[!p] with
327+
| '\128'..'\191' (* next continuation byte *) -> incr p ; decr nc
328+
| _ (* not a continuation byte *) -> nc := 0
329+
done ;
330+
incr cnt
331+
done ;
332+
let end_b = !p in
333+
String.sub s !start_b (end_b - !start_b)

lib/unicode.mli

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,9 @@ val ascii_of_ident : string -> string
4040

4141
(** Validate an UTF-8 string *)
4242
val is_utf8 : string -> bool
43+
44+
(** Return the length of a valid UTF-8 string. *)
45+
val utf8_length : string -> int
46+
47+
(** Variant of {!String.sub} for UTF-8 strings. *)
48+
val utf8_sub : string -> int -> int -> string

ltacprof/profile_ltac.ml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
open Unicode
12
open Pp
23
open Printer
34
open Util

0 commit comments

Comments
 (0)