1 /***********************************************************************/
5 /* Xavier Leroy, projet Cristal, INRIA Rocquencourt */
7 /* Copyright 1996 Institut National de Recherche en Informatique et */
8 /* en Automatique. All rights reserved. This file is distributed */
9 /* under the terms of the GNU Library General Public License, with */
10 /* the special exception on linking described in file ../../LICENSE. */
12 /***********************************************************************/
14 /* $Id: strstubs.c 7064 2005-09-22 14:21:50Z xleroy $ */
23 /* The backtracking NFA interpreter */
25 union backtrack_point {
27 value * pc; /* with low bit set */
31 unsigned char ** loc; /* with low bit clear */
36 #define Set_tag(p) ((value *) ((intnat)(p) | 1))
37 #define Clear_tag(p) ((value *) ((intnat)(p) & ~1))
38 #define Tag_is_set(p) ((intnat)(p) & 1)
40 #define BACKTRACK_STACK_BLOCK_SIZE 500
42 struct backtrack_stack {
43 struct backtrack_stack * previous;
44 union backtrack_point point[BACKTRACK_STACK_BLOCK_SIZE];
47 #define Opcode(x) ((x) & 0xFF)
48 #define Arg(x) ((uintnat)(x) >> 8)
49 #define SignedArg(x) ((intnat)(x) >> 8)
52 CHAR, /* match a single character */
53 CHARNORM, /* match a single character, after normalization */
54 STRING, /* match a character string */
55 STRINGNORM, /* match a character string, after normalization */
56 CHARCLASS, /* match a character class */
57 BOL, /* match at beginning of line */
58 EOL, /* match at end of line */
59 WORDBOUNDARY, /* match on a word boundary */
60 BEGGROUP, /* record the beginning of a group */
61 ENDGROUP, /* record the end of a group */
62 REFGROUP, /* match a previously matched group */
63 ACCEPT, /* report success */
64 SIMPLEOPT, /* match a character class 0 or 1 times */
65 SIMPLESTAR, /* match a character class 0, 1 or several times */
66 SIMPLEPLUS, /* match a character class 1 or several times */
67 GOTO, /* unconditional branch */
68 PUSHBACK, /* record a backtrack point --
69 where to jump in case of failure */
70 SETMARK, /* remember current position in given register # */
71 CHECKPROGRESS /* backtrack if no progress was made w.r.t. reg # */
74 /* Accessors in a compiled regexp */
75 #define Prog(re) Field(re, 0)
76 #define Cpool(re) Field(re, 1)
77 #define Normtable(re) Field(re, 2)
78 #define Numgroups(re) Int_val(Field(re, 3))
79 #define Numregisters(re) Int_val(Field(re, 4))
80 #define Startchars(re) Int_val(Field(re, 5))
82 /* Record positions of matched groups */
85 unsigned char * start;
88 static struct re_group re_group[NUM_GROUPS];
90 /* Record positions reached during matching; used to check progress
91 in repeated matching of a regexp. */
92 #define NUM_REGISTERS 64
93 static unsigned char * re_register[NUM_REGISTERS];
95 /* The initial backtracking stack */
96 static struct backtrack_stack initial_stack = { NULL, };
98 /* Free a chained list of backtracking stacks */
99 static void free_backtrack_stack(struct backtrack_stack * stack)
101 struct backtrack_stack * prevstack;
102 while ((prevstack = stack->previous) != NULL) {
108 /* Membership in a bit vector representing a set of booleans */
109 #define In_bitset(s,i,tmp) (tmp = (i), ((s)[tmp >> 3] >> (tmp & 7)) & 1)
111 /* Determine if a character is a word constituent */
112 static unsigned char re_word_letters[32] = {
113 0, 0, 0, 0, 0, 0, 0, 0, 254, 255, 255, 7, 254, 255, 255, 7,
114 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 127, 255, 255, 255, 127, 255
116 #define Is_word_letter(c) ((re_word_letters[(c) >> 3] >> ((c) & 7)) & 1)
118 /* The bytecode interpreter for the NFA */
119 static int re_match(value re,
120 unsigned char * starttxt,
121 register unsigned char * txt,
122 register unsigned char * endtxt,
123 int accept_partial_match)
127 struct backtrack_stack * stack;
128 union backtrack_point * sp;
132 union backtrack_point back;
137 for (p = &re_group[1], i = Numgroups(re); i > 1; i--, p++)
138 p->start = p->end = NULL;
139 for (q = &re_register[0], i = Numregisters(re); i > 0; i--, q++)
143 pc = &Field(Prog(re), 0);
144 stack = &initial_stack;
147 normtable = Normtable(re);
148 re_group[0].start = txt;
151 instr = Long_val(*pc++);
152 switch (Opcode(instr)) {
154 if (txt == endtxt) goto prefix_match;
155 if (*txt != Arg(instr)) goto backtrack;
159 if (txt == endtxt) goto prefix_match;
160 if (Byte_u(normtable, *txt) != Arg(instr)) goto backtrack;
165 (unsigned char *) String_val(Field(cpool, Arg(instr)));
166 while ((c = *s++) != 0) {
167 if (txt == endtxt) goto prefix_match;
168 if (c != *txt) goto backtrack;
175 (unsigned char *) String_val(Field(cpool, Arg(instr)));
176 while ((c = *s++) != 0) {
177 if (txt == endtxt) goto prefix_match;
178 if (c != Byte_u(normtable, *txt)) goto backtrack;
184 if (txt == endtxt) goto prefix_match;
185 if (! In_bitset(String_val(Field(cpool, Arg(instr))), *txt, c))
190 if (txt > starttxt && txt[-1] != '\n') goto backtrack;
193 if (txt < endtxt && *txt != '\n') goto backtrack;
196 /* At beginning and end of text: no
197 At beginning of text: OK if current char is a letter
198 At end of text: OK if previous char is a letter
200 OK if previous char is a letter and current char not a letter
201 or previous char is not a letter and current char is a letter */
202 if (txt == starttxt) {
203 if (txt == endtxt) goto prefix_match;
204 if (Is_word_letter(txt[0])) break;
206 } else if (txt == endtxt) {
207 if (Is_word_letter(txt[-1])) break;
210 if (Is_word_letter(txt[-1]) != Is_word_letter(txt[0])) break;
214 int group_no = Arg(instr);
215 struct re_group * group = &(re_group[group_no]);
216 back.undo.loc = &(group->start);
217 back.undo.val = group->start;
222 int group_no = Arg(instr);
223 struct re_group * group = &(re_group[group_no]);
224 back.undo.loc = &(group->end);
225 back.undo.val = group->end;
230 int group_no = Arg(instr);
231 struct re_group * group = &(re_group[group_no]);
233 if (group->start == NULL || group->end == NULL) goto backtrack;
234 for (s = group->start; s < group->end; s++) {
235 if (txt == endtxt) goto prefix_match;
236 if (*s != *txt) goto backtrack;
244 char * set = String_val(Field(cpool, Arg(instr)));
245 if (txt < endtxt && In_bitset(set, *txt, c)) txt++;
249 char * set = String_val(Field(cpool, Arg(instr)));
250 while (txt < endtxt && In_bitset(set, *txt, c))
255 char * set = String_val(Field(cpool, Arg(instr)));
256 if (txt == endtxt) goto prefix_match;
257 if (! In_bitset(set, *txt, c)) goto backtrack;
259 while (txt < endtxt && In_bitset(set, *txt, c))
264 pc = pc + SignedArg(instr);
267 back.pos.pc = Set_tag(pc + SignedArg(instr));
271 int reg_no = Arg(instr);
272 unsigned char ** reg = &(re_register[reg_no]);
274 back.undo.val = *reg;
278 case CHECKPROGRESS: {
279 int reg_no = Arg(instr);
280 if (re_register[reg_no] == txt)
285 caml_fatal_error ("impossible case in re_match");
287 /* Continue with next instruction */
291 /* Push an item on the backtrack stack and continue with next instr */
292 if (sp == stack->point + BACKTRACK_STACK_BLOCK_SIZE) {
293 struct backtrack_stack * newstack =
294 stat_alloc(sizeof(struct backtrack_stack));
295 newstack->previous = stack;
304 /* We get here when matching failed because the end of text
306 if (accept_partial_match) goto accept;
309 /* We get here when matching fails. Backtrack to most recent saved
310 program point, undoing variable assignments on the way. */
312 if (sp == stack->point) {
313 struct backtrack_stack * prevstack = stack->previous;
314 if (prevstack == NULL) return 0;
317 sp = stack->point + BACKTRACK_STACK_BLOCK_SIZE;
320 if (Tag_is_set(sp->pos.pc)) {
321 pc = Clear_tag(sp->pos.pc);
325 *(sp->undo.loc) = sp->undo.val;
332 /* We get here when the regexp was successfully matched */
333 free_backtrack_stack(stack);
334 re_group[0].end = txt;
338 /* Allocate an integer array containing the positions of the matched groups.
339 Beginning of group #N is at 2N, end is at 2N+1.
340 Take position = -1 when group wasn't matched. */
342 static value re_alloc_groups(value re, value str)
346 unsigned char * starttxt = (unsigned char *) String_val(str);
347 int n = Numgroups(re);
349 struct re_group * group;
351 res = alloc(n * 2, 0);
352 for (i = 0; i < n; i++) {
353 group = &(re_group[i]);
354 if (group->start == NULL || group->end == NULL) {
355 Field(res, i * 2) = Val_int(-1);
356 Field(res, i * 2 + 1) = Val_int(-1);
358 Field(res, i * 2) = Val_long(group->start - starttxt);
359 Field(res, i * 2 + 1) = Val_long(group->end - starttxt);
365 /* String matching and searching. All functions return the empty array
366 on failure, and an array of positions on success. */
368 CAMLprim value re_string_match(value re, value str, value pos)
370 unsigned char * starttxt = &Byte_u(str, 0);
371 unsigned char * txt = &Byte_u(str, Long_val(pos));
372 unsigned char * endtxt = &Byte_u(str, string_length(str));
374 if (txt < starttxt || txt > endtxt)
375 invalid_argument("Str.string_match");
376 if (re_match(re, starttxt, txt, endtxt, 0)) {
377 return re_alloc_groups(re, str);
383 CAMLprim value re_partial_match(value re, value str, value pos)
385 unsigned char * starttxt = &Byte_u(str, 0);
386 unsigned char * txt = &Byte_u(str, Long_val(pos));
387 unsigned char * endtxt = &Byte_u(str, string_length(str));
389 if (txt < starttxt || txt > endtxt)
390 invalid_argument("Str.string_partial_match");
391 if (re_match(re, starttxt, txt, endtxt, 1)) {
392 return re_alloc_groups(re, str);
398 CAMLprim value re_search_forward(value re, value str, value startpos)
400 unsigned char * starttxt = &Byte_u(str, 0);
401 unsigned char * txt = &Byte_u(str, Long_val(startpos));
402 unsigned char * endtxt = &Byte_u(str, string_length(str));
403 unsigned char * startchars;
405 if (txt < starttxt || txt > endtxt)
406 invalid_argument("Str.search_forward");
407 if (Startchars(re) == -1) {
409 if (re_match(re, starttxt, txt, endtxt, 0))
410 return re_alloc_groups(re, str);
412 } while (txt <= endtxt);
416 (unsigned char *) String_val(Field(Cpool(re), Startchars(re)));
418 while (txt < endtxt && startchars[*txt] == 0) txt++;
419 if (re_match(re, starttxt, txt, endtxt, 0))
420 return re_alloc_groups(re, str);
422 } while (txt <= endtxt);
427 CAMLprim value re_search_backward(value re, value str, value startpos)
429 unsigned char * starttxt = &Byte_u(str, 0);
430 unsigned char * txt = &Byte_u(str, Long_val(startpos));
431 unsigned char * endtxt = &Byte_u(str, string_length(str));
432 unsigned char * startchars;
434 if (txt < starttxt || txt > endtxt)
435 invalid_argument("Str.search_backward");
436 if (Startchars(re) == -1) {
438 if (re_match(re, starttxt, txt, endtxt, 0))
439 return re_alloc_groups(re, str);
441 } while (txt >= starttxt);
445 (unsigned char *) String_val(Field(Cpool(re), Startchars(re)));
447 while (txt > starttxt && startchars[*txt] == 0) txt--;
448 if (re_match(re, starttxt, txt, endtxt, 0))
449 return re_alloc_groups(re, str);
451 } while (txt >= starttxt);
458 CAMLprim value re_replacement_text(value repl, value groups, value orig)
460 CAMLparam3(repl, groups, orig);
462 mlsize_t start, end, len, n;
467 p = String_val(repl);
468 n = string_length(repl);
474 if (n == 0) failwith("Str.replace: illegal backslash sequence");
479 case '0': case '1': case '2': case '3': case '4':
480 case '5': case '6': case '7': case '8': case '9':
482 if (c*2 >= Wosize_val(groups))
483 failwith("Str.replace: reference to unmatched group");
484 start = Long_val(Field(groups, c*2));
485 end = Long_val(Field(groups, c*2 + 1));
486 if (start == (mlsize_t) -1)
487 failwith("Str.replace: reference to unmatched group");
495 res = alloc_string(len);
496 p = String_val(repl);
498 n = string_length(repl);
508 case '0': case '1': case '2': case '3': case '4':
509 case '5': case '6': case '7': case '8': case '9':
511 start = Long_val(Field(groups, c*2));
512 end = Long_val(Field(groups, c*2 + 1));
514 memmove (q, &Byte(orig, start), len);
518 *q++ = '\\'; *q++ = c; break;