1 (***********************************************************************)
5 (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
7 (* Copyright 1996 Institut National de Recherche en Informatique et *)
8 (* en Automatique. All rights reserved. This file is distributed *)
9 (* under the terms of the GNU Library General Public License, with *)
10 (* the special exception on linking described in file ../../LICENSE. *)
12 (***********************************************************************)
14 (* $Id: str.ml 9157 2008-12-12 08:54:15Z xleroy $ *)
16 (** String utilities *)
18 let string_before s n = String.sub s 0 n
20 let string_after s n = String.sub s n (String.length s - n)
22 let first_chars s n = String.sub s 0 n
24 let last_chars s n = String.sub s (String.length s - n) n
26 (** Representation of character sets **)
30 type t = string (* of length 32 *)
32 let empty = String.make 32 '\000'
33 let full = String.make 32 '\255'
35 let make_empty () = String.make 32 '\000'
38 let i = Char.code c in
39 s.[i lsr 3] <- Char.chr(Char.code s.[i lsr 3] lor (1 lsl (i land 7)))
41 let add_range s c1 c2 =
42 for i = Char.code c1 to Char.code c2 do add s (Char.chr i) done
45 let s = make_empty () in add s c; s
48 let s = make_empty () in add_range s c1 c2; s
51 let r = String.create 32 in
53 r.[i] <- Char.chr(Char.code s.[i] lxor 0xFF)
58 let r = String.create 32 in
60 r.[i] <- Char.chr(Char.code s1.[i] lor Char.code s2.[i])
67 if Char.code s1.[i] land Char.code s2.[i] <> 0 then raise Exit
75 let c = Char.code s.[i] in
78 if c land (1 lsl j) <> 0 then fn (Char.chr ((i lsl 3) + j))
83 let r = String.make 256 '\000' in
84 iter (fun c -> r.[Char.code c] <- '\001') s;
88 let r = make_empty() in
89 iter (fun c -> add r (Char.lowercase c); add r (Char.uppercase c)) s;
94 (** Abstract syntax tree for regular expressions *)
99 | CharClass of Charset.t * bool (* true = complemented, false = normal *)
100 | Seq of re_syntax list
101 | Alt of re_syntax * re_syntax
104 | Option of re_syntax
105 | Group of int * re_syntax
111 (** Representation of compiled regular expressions *)
114 prog: int array; (* bytecode instructions *)
115 cpool: string array; (* constant pool (string literals) *)
116 normtable: string; (* case folding table (if any) *)
117 numgroups: int; (* number of \(...\) groups *)
118 numregisters: int; (* number of nullable Star or Plus *)
119 startchars: int (* index of set of starting chars, or -1 if none *)
122 (** Opcodes for bytecode instructions; see strstubs.c for description *)
127 let op_STRINGNORM = 3
131 let op_WORDBOUNDARY = 7
136 let op_SIMPLEOPT = 12
137 let op_SIMPLESTAR = 13
138 let op_SIMPLEPLUS = 14
142 let op_CHECKPROGRESS = 18
144 (* Encoding of bytecode instructions *)
146 let instr opc arg = opc lor (arg lsl 8)
148 (* Computing relative displacements for GOTO and PUSHBACK instructions *)
150 let displ dest from = dest - from - 1
152 (** Compilation of a regular expression *)
154 (* Determine if a regexp can match the empty string *)
156 let rec is_nullable = function
159 | CharClass(cl, cmpl) -> false
160 | Seq rl -> List.for_all is_nullable rl
161 | Alt (r1, r2) -> is_nullable r1 || is_nullable r2
163 | Plus r -> is_nullable r
165 | Group(n, r) -> is_nullable r
169 | Wordboundary -> true
171 (* first r returns a set of characters C such that:
172 for all string s, s matches r => the first character of s is in C.
173 For convenience, return Charset.full if r is nullable. *)
175 let rec first = function
176 Char c -> Charset.singleton c
177 | String s -> if s = "" then Charset.full else Charset.singleton s.[0]
178 | CharClass(cl, cmpl) -> if cmpl then Charset.complement cl else cl
179 | Seq rl -> first_seq rl
180 | Alt (r1, r2) -> Charset.union (first r1) (first r2)
181 | Star r -> Charset.full
183 | Option r -> Charset.full
184 | Group(n, r) -> first r
185 | Refgroup n -> Charset.full
186 | Bol -> Charset.full
187 | Eol -> Charset.full
188 | Wordboundary -> Charset.full
190 and first_seq = function
192 | (Bol | Eol | Wordboundary) :: rl -> first_seq rl
193 | Star r :: rl -> Charset.union (first r) (first_seq rl)
194 | Option r :: rl -> Charset.union (first r) (first_seq rl)
197 (* Transform a Char or CharClass regexp into a character class *)
199 let charclass_of_regexp fold_case re =
202 | Char c -> (Charset.singleton c, false)
203 | CharClass(cl, compl) -> (cl, compl)
204 | _ -> assert false in
205 let cl2 = if fold_case then Charset.fold_case cl1 else cl1 in
206 if compl then Charset.complement cl2 else cl2
208 (* The case fold table: maps characters to their lowercase equivalent *)
210 let fold_case_table =
211 let t = String.create 256 in
212 for i = 0 to 255 do t.[i] <- Char.lowercase(Char.chr i) done;
215 module StringMap = Map.Make(struct type t = string let compare = compare end)
217 (* Compilation of a regular expression *)
219 let compile fold_case re =
221 (* Instruction buffering *)
222 let prog = ref (Array.make 32 0)
224 and cpool = ref StringMap.empty
226 and numgroups = ref 1
227 and numregs = ref 0 in
228 (* Add a new instruction *)
229 let emit_instr opc arg =
230 if !progpos >= Array.length !prog then begin
231 let newlen = ref (Array.length !prog) in
232 while !progpos >= !newlen do newlen := !newlen * 2 done;
233 let nprog = Array.make !newlen 0 in
234 Array.blit !prog 0 nprog 0 (Array.length !prog);
237 (!prog).(!progpos) <- (instr opc arg);
239 (* Reserve an instruction slot and return its position *)
241 let p = !progpos in incr progpos; p in
242 (* Fill a reserved instruction slot with a GOTO or PUSHBACK instruction *)
243 let patch_instr pos opc dest =
244 (!prog).(pos) <- (instr opc (displ dest pos)) in
245 (* Return the cpool index for the given string, adding it if not
249 StringMap.find s !cpool
252 cpool := StringMap.add s p !cpool;
255 (* Allocate fresh register if regexp is nullable *)
256 let allocate_register_if_nullable r =
257 if is_nullable r then begin
259 if n >= 64 then failwith "too many r* or r+ where r is nullable";
264 (* Main recursive compilation function *)
265 let rec emit_code = function
268 emit_instr op_CHARNORM (Char.code (Char.lowercase c))
270 emit_instr op_CHAR (Char.code c)
272 begin match String.length s with
276 emit_instr op_CHARNORM (Char.code (Char.lowercase s.[0]))
278 emit_instr op_CHAR (Char.code s.[0])
281 (* null characters are not accepted by the STRING* instructions;
282 if one is found, split string at null character *)
283 let i = String.index s '\000' in
284 emit_code (String (string_before s i));
285 emit_instr op_CHAR 0;
286 emit_code (String (string_after s (i+1)))
289 emit_instr op_STRINGNORM (cpool_index (String.lowercase s))
291 emit_instr op_STRING (cpool_index s)
293 | CharClass(cl, compl) ->
294 let cl1 = if fold_case then Charset.fold_case cl else cl in
295 let cl2 = if compl then Charset.complement cl1 else cl1 in
296 emit_instr op_CHARCLASS (cpool_index cl2)
305 let pos_pushback = emit_hole() in
307 let pos_goto_end = emit_hole() in
308 let lbl1 = !progpos in
310 let lbl2 = !progpos in
311 patch_instr pos_pushback op_PUSHBACK lbl1;
312 patch_instr pos_goto_end op_GOTO lbl2
314 (* Implement longest match semantics for compatibility with old Str *)
315 (* General translation:
322 If r cannot match the empty string, code can be simplified:
328 let regno = allocate_register_if_nullable r in
329 let lbl1 = emit_hole() in
330 if regno >= 0 then emit_instr op_SETMARK regno;
332 if regno >= 0 then emit_instr op_CHECKPROGRESS regno;
333 emit_instr op_GOTO (displ lbl1 !progpos);
334 let lbl2 = !progpos in
335 patch_instr lbl1 op_PUSHBACK lbl2
337 (* Implement longest match semantics for compatibility with old Str *)
338 (* General translation:
345 If r cannot match the empty string, code can be simplified:
351 let regno = allocate_register_if_nullable r in
352 let lbl1 = !progpos in
354 if regno >= 0 then emit_instr op_CHECKPROGRESS regno;
355 let pos_pushback = emit_hole() in
356 if regno >= 0 then emit_instr op_SETMARK regno;
357 emit_instr op_GOTO (displ lbl1 !progpos);
358 let lbl2 = !progpos in
359 patch_instr pos_pushback op_PUSHBACK lbl2
361 (* Implement longest match semantics for compatibility with old Str *)
366 let pos_pushback = emit_hole() in
368 let lbl = !progpos in
369 patch_instr pos_pushback op_PUSHBACK lbl
371 if n >= 32 then failwith "too many \\(...\\) groups";
372 emit_instr op_BEGGROUP n;
374 emit_instr op_ENDGROUP n;
375 numgroups := max !numgroups (n+1)
377 emit_instr op_REFGROUP n
383 emit_instr op_WORDBOUNDARY 0
385 and emit_seq_code = function
387 | Star(Char _ | CharClass _ as r) :: rl
388 when disjoint_modulo_case (first r) (first_seq rl) ->
389 emit_instr op_SIMPLESTAR (cpool_index (charclass_of_regexp fold_case r));
391 | Plus(Char _ | CharClass _ as r) :: rl
392 when disjoint_modulo_case (first r) (first_seq rl) ->
393 emit_instr op_SIMPLEPLUS (cpool_index (charclass_of_regexp fold_case r));
395 | Option(Char _ | CharClass _ as r) :: rl
396 when disjoint_modulo_case (first r) (first_seq rl) ->
397 emit_instr op_SIMPLEOPT (cpool_index (charclass_of_regexp fold_case r));
403 and disjoint_modulo_case c1 c2 =
405 then Charset.disjoint (Charset.fold_case c1) (Charset.fold_case c2)
406 else Charset.disjoint c1 c2
410 emit_instr op_ACCEPT 0;
411 let start = first re in
412 let start' = if fold_case then Charset.fold_case start else start in
414 if start = Charset.full
416 else cpool_index (Charset.expand start') in
417 let constantpool = Array.make !cpoolpos "" in
418 StringMap.iter (fun str idx -> constantpool.(idx) <- str) !cpool;
419 { prog = Array.sub !prog 0 !progpos;
420 cpool = constantpool;
421 normtable = if fold_case then fold_case_table else "";
422 numgroups = !numgroups;
423 numregisters = !numregs;
424 startchars = start_pos }
426 (** Parsing of a regular expression *)
428 (* Efficient buffering of sequences *)
430 module SeqBuffer = struct
432 type t = { sb_chars: Buffer.t; mutable sb_next: re_syntax list }
434 let create() = { sb_chars = Buffer.create 16; sb_next = [] }
437 let s = Buffer.contents buf.sb_chars in
438 Buffer.clear buf.sb_chars;
439 match String.length s with
441 | 1 -> buf.sb_next <- Char s.[0] :: buf.sb_next
442 | _ -> buf.sb_next <- String s :: buf.sb_next
446 Char c -> Buffer.add_char buf.sb_chars c
447 | _ -> flush buf; buf.sb_next <- re :: buf.sb_next
450 flush buf; Seq(List.rev buf.sb_next)
454 (* The character class corresponding to `.' *)
456 let dotclass = Charset.complement (Charset.singleton '\n')
458 (* Parse a regular expression *)
461 let len = String.length s in
462 let group_counter = ref 1 in
465 let (r, j) = regexp1 i in
467 and regexp0cont r1 i =
468 if i + 2 <= len && s.[i] = '\\' && s.[i+1] = '|' then
469 let (r2, j) = regexp1 (i+2) in
470 regexp0cont (Alt(r1, r2)) j
474 regexp1cont (SeqBuffer.create()) i
475 and regexp1cont sb i =
477 || i + 2 <= len && s.[i] = '\\' && (let c = s.[i+1] in c = '|' || c = ')')
479 (SeqBuffer.extract sb, i)
481 let (r, j) = regexp2 i in
485 let (r, j) = regexp3 i in
487 and regexp2cont r i =
488 if i >= len then (r, i) else
490 '?' -> regexp2cont (Option r) (i+1)
491 | '*' -> regexp2cont (Star r) (i+1)
492 | '+' -> regexp2cont (Plus r) (i+1)
496 '\\' -> regexpbackslash (i+1)
497 | '[' -> let (c, compl, j) = regexpclass0 (i+1) in
498 (CharClass(c, compl), j)
501 | '.' -> (CharClass(dotclass, false), i+1)
503 and regexpbackslash i =
504 if i >= len then (Char '\\', i) else
509 let group_no = !group_counter in
510 if group_no < 32 then incr group_counter;
511 let (r, j) = regexp0 (i+1) in
512 if j + 1 < len && s.[j] = '\\' && s.[j+1] = ')' then
514 then (Group(group_no, r), j + 2)
517 failwith "\\( group not closed by \\)"
519 (Refgroup(Char.code c - 48), i + 1)
521 (Wordboundary, i + 1)
525 if i < len && s.[i] = '^'
526 then let (c, j) = regexpclass1 (i+1) in (c, true, j)
527 else let (c, j) = regexpclass1 i in (c, false, j)
529 let c = Charset.make_empty() in
530 let j = regexpclass2 c i i in
532 and regexpclass2 c start i =
533 if i >= len then failwith "[ class not closed by ]";
534 if s.[i] = ']' && i > start then i+1 else begin
536 if i+2 < len && s.[i+1] = '-' && s.[i+2] <> ']' then begin
538 Charset.add_range c c1 c2;
539 regexpclass2 c start (i+3)
542 regexpclass2 c start (i+1)
546 let (r, j) = regexp0 0 in
547 if j = len then r else failwith "spurious \\) in regular expression"
549 (** Parsing and compilation *)
551 let regexp e = compile false (parse e)
553 let regexp_case_fold e = compile true (parse e)
556 let len = String.length s in
557 let buf = String.create (2 * len) in
559 for i = 0 to len - 1 do
561 '[' | ']' | '*' | '.' | '\\' | '?' | '+' | '^' | '$' as c ->
562 buf.[!pos] <- '\\'; buf.[!pos + 1] <- c; pos := !pos + 2
564 buf.[!pos] <- c; pos := !pos + 1
566 String.sub buf 0 !pos
568 let regexp_string s = compile false (String s)
570 let regexp_string_case_fold s = compile true (String s)
572 (** Matching functions **)
574 external re_string_match: regexp -> string -> int -> int array
576 external re_partial_match: regexp -> string -> int -> int array
578 external re_search_forward: regexp -> string -> int -> int array
579 = "re_search_forward"
580 external re_search_backward: regexp -> string -> int -> int array
581 = "re_search_backward"
583 let last_search_result = ref [||]
585 let string_match re s pos =
586 let res = re_string_match re s pos in
587 last_search_result := res;
590 let string_partial_match re s pos =
591 let res = re_partial_match re s pos in
592 last_search_result := res;
595 let search_forward re s pos =
596 let res = re_search_forward re s pos in
597 last_search_result := res;
598 if Array.length res = 0 then raise Not_found else res.(0)
600 let search_backward re s pos =
601 let res = re_search_backward re s pos in
602 last_search_result := res;
603 if Array.length res = 0 then raise Not_found else res.(0)
605 let group_beginning n =
607 if n < 0 || n2 >= Array.length !last_search_result then
608 invalid_arg "Str.group_beginning"
610 let pos = !last_search_result.(n2) in
611 if pos = -1 then raise Not_found else pos
615 if n < 0 || n2 >= Array.length !last_search_result then
616 invalid_arg "Str.group_end"
618 let pos = !last_search_result.(n2 + 1) in
619 if pos = -1 then raise Not_found else pos
621 let matched_group n txt =
623 if n < 0 || n2 >= Array.length !last_search_result then
624 invalid_arg "Str.matched_group"
626 let b = !last_search_result.(n2)
627 and e = !last_search_result.(n2 + 1) in
628 if b = -1 then raise Not_found else String.sub txt b (e - b)
630 let match_beginning () = group_beginning 0
631 and match_end () = group_end 0
632 and matched_string txt = matched_group 0 txt
636 external re_replacement_text: string -> int array -> string -> string
637 = "re_replacement_text"
639 let replace_matched repl matched =
640 re_replacement_text repl !last_search_result matched
642 let substitute_first expr repl_fun text =
644 let pos = search_forward expr text 0 in
645 String.concat "" [string_before text pos;
647 string_after text (match_end())]
651 let opt_search_forward re s pos =
652 try Some(search_forward re s pos) with Not_found -> None
654 let global_substitute expr repl_fun text =
655 let rec replace accu start last_was_empty =
656 let startpos = if last_was_empty then start + 1 else start in
657 if startpos > String.length text then
658 string_after text start :: accu
660 match opt_search_forward expr text startpos with
662 string_after text start :: accu
664 let end_pos = match_end() in
665 let repl_text = repl_fun text in
666 replace (repl_text :: String.sub text start (pos-start) :: accu)
667 end_pos (end_pos = pos)
669 String.concat "" (List.rev (replace [] 0 false))
671 let global_replace expr repl text =
672 global_substitute expr (replace_matched repl) text
673 and replace_first expr repl text =
674 substitute_first expr (replace_matched repl) text
678 let opt_search_forward_progress expr text start =
679 match opt_search_forward expr text start with
682 if match_end() > start then
684 else if start < String.length text then
685 opt_search_forward expr text (start + 1)
688 let bounded_split expr text num =
690 if string_match expr text 0 then match_end() else 0 in
691 let rec split accu start n =
692 if start >= String.length text then accu else
693 if n = 1 then string_after text start :: accu else
694 match opt_search_forward_progress expr text start with
696 string_after text start :: accu
698 split (String.sub text start (pos-start) :: accu)
701 List.rev (split [] start num)
703 let split expr text = bounded_split expr text 0
705 let bounded_split_delim expr text num =
706 let rec split accu start n =
707 if start > String.length text then accu else
708 if n = 1 then string_after text start :: accu else
709 match opt_search_forward_progress expr text start with
711 string_after text start :: accu
713 split (String.sub text start (pos-start) :: accu)
716 if text = "" then [] else List.rev (split [] 0 num)
718 let split_delim expr text = bounded_split_delim expr text 0
720 type split_result = Text of string | Delim of string
722 let bounded_full_split expr text num =
723 let rec split accu start n =
724 if start >= String.length text then accu else
725 if n = 1 then Text(string_after text start) :: accu else
726 match opt_search_forward_progress expr text start with
728 Text(string_after text start) :: accu
730 let s = matched_string text in
732 split (Delim(s) :: Text(String.sub text start (pos-start)) :: accu)
735 split (Delim(s) :: accu)
738 List.rev (split [] 0 num)
740 let full_split expr text = bounded_full_split expr text 0