1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
58 int *p_match_first) internal_function;
59 static int check_halt_state_context (const re_match_context_t *mctx,
60 const re_dfastate_t *state, int idx)
62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 regmatch_t *prev_idx_match, int cur_node,
64 int cur_idx, int nmatch) internal_function;
65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 int str_idx, int dest_node, int nregs,
68 re_node_set *eps_via_nodes)
70 static reg_errcode_t set_regs (const regex_t *preg,
71 const re_match_context_t *mctx,
72 size_t nmatch, regmatch_t *pmatch,
73 int fl_backtrack) internal_function;
74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 int node_idx, int str_idx, int max_str_idx)
83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 re_sift_context_t *sctx)
86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 re_sift_context_t *sctx, int str_idx,
88 re_node_set *cur_dest)
90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
93 re_node_set *dest_nodes)
95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 re_node_set *dest_nodes,
97 const re_node_set *candidates)
99 static int check_dst_limits (const re_match_context_t *mctx,
101 int dst_node, int dst_idx, int src_node,
102 int src_idx) internal_function;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 int boundaries, int subexp_idx,
105 int from_node, int bkref_idx)
107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 int limit, int subexp_idx,
109 int node, int str_idx,
110 int bkref_idx) internal_function;
111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 re_node_set *dest_nodes,
113 const re_node_set *candidates,
115 struct re_backref_cache_entry *bkref_ents,
116 int str_idx) internal_function;
117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, const re_node_set *candidates)
121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
123 re_dfastate_t **src, int num)
125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 re_match_context_t *mctx) internal_function;
127 static re_dfastate_t *transit_state (reg_errcode_t *err,
128 re_match_context_t *mctx,
129 re_dfastate_t *state) internal_function;
130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 re_match_context_t *mctx,
132 re_dfastate_t *next_state)
134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate)
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 re_dfastate_t *pstate)
148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 const re_node_set *nodes)
151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx)
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str)
159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 int subexp_idx, int type) internal_function;
161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 state_array_t *path, int top_node,
163 int top_str, int last_node, int last_str,
164 int type) internal_function;
165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
167 re_node_set *cur_nodes,
168 re_node_set *next_nodes)
170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type)
174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 re_node_set *dst_nodes,
176 int target, int ex_subexp,
177 int type) internal_function;
178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 re_node_set *cur_nodes, int cur_str,
180 int subexp_num, int type)
182 static int build_trtable (const re_dfa_t *dfa,
183 re_dfastate_t *state) internal_function;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
186 const re_string_t *input, int idx)
189 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
190 const re_dfastate_t *state,
191 re_node_set *states_node,
192 bitset_t *states_ch) internal_function;
193 static int check_node_accept (const re_match_context_t *mctx,
194 const re_token_t *node, int idx)
196 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
199 /* Entry point for POSIX code. */
201 /* regexec searches for a given pattern, specified by PREG, in the
204 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
205 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
206 least NMATCH elements, and we set them to the offsets of the
207 corresponding matched substrings.
209 EFLAGS specifies `execution flags' which affect matching: if
210 REG_NOTBOL is set, then ^ does not match at the beginning of the
211 string; if REG_NOTEOL is set, then $ does not match at the end.
213 We return 0 if we find a match and REG_NOMATCH if not. */
216 regexec (preg, string, nmatch, pmatch, eflags)
217 const regex_t *__restrict preg;
218 const char *__restrict string;
225 #ifndef __UCLIBC__ /* libc_lock_lock does not exist */
226 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
229 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
232 if (eflags & REG_STARTEND)
234 start = pmatch[0].rm_so;
235 length = pmatch[0].rm_eo;
240 length = strlen (string);
243 __libc_lock_lock (dfa->lock);
245 err = re_search_internal (preg, string, length, start, length - start,
246 length, 0, NULL, eflags);
248 err = re_search_internal (preg, string, length, start, length - start,
249 length, nmatch, pmatch, eflags);
250 __libc_lock_unlock (dfa->lock);
251 return err != REG_NOERROR;
253 libc_hidden_def(regexec)
255 /* Entry points for GNU code. */
257 /* re_match, re_search, re_match_2, re_search_2
259 The former two functions operate on STRING with length LENGTH,
260 while the later two operate on concatenation of STRING1 and STRING2
261 with lengths LENGTH1 and LENGTH2, respectively.
263 re_match() matches the compiled pattern in BUFP against the string,
264 starting at index START.
266 re_search() first tries matching at index START, then it tries to match
267 starting from index START + 1, and so on. The last start position tried
268 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
271 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
272 the first STOP characters of the concatenation of the strings should be
275 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
276 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
277 computed relative to the concatenation, not relative to the individual
280 On success, re_match* functions return the length of the match, re_search*
281 return the position of the start of the match. Return value -1 means no
282 match was found and -2 indicates an internal error. */
285 re_match (bufp, string, length, start, regs)
286 struct re_pattern_buffer *bufp;
289 struct re_registers *regs;
291 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
295 re_search (bufp, string, length, start, range, regs)
296 struct re_pattern_buffer *bufp;
298 int length, start, range;
299 struct re_registers *regs;
301 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
303 libc_hidden_def(re_search)
306 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
307 struct re_pattern_buffer *bufp;
308 const char *string1, *string2;
309 int length1, length2, start, stop;
310 struct re_registers *regs;
312 return re_search_2_stub (bufp, string1, length1, string2, length2,
313 start, 0, regs, stop, 1);
317 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
318 struct re_pattern_buffer *bufp;
319 const char *string1, *string2;
320 int length1, length2, start, range, stop;
321 struct re_registers *regs;
323 return re_search_2_stub (bufp, string1, length1, string2, length2,
324 start, range, regs, stop, 0);
326 libc_hidden_def(re_search_2)
329 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
331 struct re_pattern_buffer *bufp;
332 const char *string1, *string2;
333 int length1, length2, start, range, stop, ret_len;
334 struct re_registers *regs;
338 int len = length1 + length2;
341 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
344 /* Concatenate the strings. */
348 char *s = re_malloc (char, len);
350 if (BE (s == NULL, 0))
352 memcpy (s, string1, length1);
353 memcpy (s + length1, string2, length2);
362 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
365 re_free ((char *) str);
369 /* The parameters have the same meaning as those of re_search.
370 Additional parameters:
371 If RET_LEN is nonzero the length of the match is returned (re_match style);
372 otherwise the position of the match is returned. */
375 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
376 struct re_pattern_buffer *bufp;
378 int length, start, range, stop, ret_len;
379 struct re_registers *regs;
381 reg_errcode_t result;
385 #ifndef __UCLIBC__ /* libc_lock_lock does not exist */
386 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
389 /* Check for out-of-range. */
390 if (BE (start < 0 || start > length, 0))
392 if (BE (start + range > length, 0))
393 range = length - start;
394 else if (BE (start + range < 0, 0))
397 __libc_lock_lock (dfa->lock);
399 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
400 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
402 /* Compile fastmap if we haven't yet. */
403 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
404 re_compile_fastmap (bufp);
406 if (BE (bufp->no_sub, 0))
409 /* We need at least 1 register. */
412 else if (BE (bufp->regs_allocated == REGS_FIXED &&
413 regs->num_regs < bufp->re_nsub + 1, 0))
415 nregs = regs->num_regs;
416 if (BE (nregs < 1, 0))
418 /* Nothing can be copied to regs. */
424 nregs = bufp->re_nsub + 1;
425 pmatch = re_malloc (regmatch_t, nregs);
426 if (BE (pmatch == NULL, 0))
432 result = re_search_internal (bufp, string, length, start, range, stop,
433 nregs, pmatch, eflags);
437 /* I hope we needn't fill ther regs with -1's when no match was found. */
438 if (result != REG_NOERROR)
440 else if (regs != NULL)
442 /* If caller wants register contents data back, copy them. */
443 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
444 bufp->regs_allocated);
445 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
449 if (BE (rval == 0, 1))
453 assert (pmatch[0].rm_so == start);
454 rval = pmatch[0].rm_eo - start;
457 rval = pmatch[0].rm_so;
461 __libc_lock_unlock (dfa->lock);
466 re_copy_regs (regs, pmatch, nregs, regs_allocated)
467 struct re_registers *regs;
469 int nregs, regs_allocated;
471 int rval = REGS_REALLOCATE;
473 int need_regs = nregs + 1;
474 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
477 /* Have the register data arrays been allocated? */
478 if (regs_allocated == REGS_UNALLOCATED)
479 { /* No. So allocate them with malloc. */
480 regs->start = re_malloc (regoff_t, need_regs);
481 regs->end = re_malloc (regoff_t, need_regs);
482 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
483 return REGS_UNALLOCATED;
484 regs->num_regs = need_regs;
486 else if (regs_allocated == REGS_REALLOCATE)
487 { /* Yes. If we need more elements than were already
488 allocated, reallocate them. If we need fewer, just
490 if (BE (need_regs > regs->num_regs, 0))
492 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
493 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
494 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
495 return REGS_UNALLOCATED;
496 regs->start = new_start;
498 regs->num_regs = need_regs;
503 assert (regs_allocated == REGS_FIXED);
504 /* This function may not be called with REGS_FIXED and nregs too big. */
505 assert (regs->num_regs >= nregs);
510 for (i = 0; i < nregs; ++i)
512 regs->start[i] = pmatch[i].rm_so;
513 regs->end[i] = pmatch[i].rm_eo;
515 for ( ; i < regs->num_regs; ++i)
516 regs->start[i] = regs->end[i] = -1;
521 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
522 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
523 this memory for recording register information. STARTS and ENDS
524 must be allocated using the malloc library routine, and must each
525 be at least NUM_REGS * sizeof (regoff_t) bytes long.
527 If NUM_REGS == 0, then subsequent matches should allocate their own
530 Unless this function is called, the first search or match using
531 PATTERN_BUFFER will allocate its own register data, without
532 freeing the old data. */
535 re_set_registers (bufp, regs, num_regs, starts, ends)
536 struct re_pattern_buffer *bufp;
537 struct re_registers *regs;
539 regoff_t *starts, *ends;
543 bufp->regs_allocated = REGS_REALLOCATE;
544 regs->num_regs = num_regs;
545 regs->start = starts;
550 bufp->regs_allocated = REGS_UNALLOCATED;
552 regs->start = regs->end = (regoff_t *) 0;
556 /* Entry points compatible with 4.2 BSD regex library. We don't define
557 them unless specifically requested. */
559 #if defined _REGEX_RE_COMP || defined __UCLIBC__
562 re_exec (const char *s)
564 return 0 == regexec (re_comp_buf, s, 0, NULL, 0);
568 /* Internal entry point. */
570 /* Searches for a compiled pattern PREG in the string STRING, whose
571 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
572 mingings with regexec. START, and RANGE have the same meanings
574 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
575 otherwise return the error code.
576 Note: We assume front end functions already check ranges.
577 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
580 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
584 int length, start, range, stop, eflags;
589 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
590 int left_lim, right_lim, incr;
591 int fl_longest_match, match_first, match_kind, match_last = -1;
594 re_match_context_t mctx;
595 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
596 && range && !preg->can_be_null) ? preg->fastmap : NULL;
597 RE_TRANSLATE_TYPE t = preg->translate;
599 memset (&mctx, '\0', sizeof (re_match_context_t));
602 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
603 nmatch -= extra_nmatch;
605 /* Check if the DFA haven't been compiled. */
606 if (BE (preg->used == 0 || dfa->init_state == NULL
607 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
608 || dfa->init_state_begbuf == NULL, 0))
612 /* We assume front-end functions already check them. */
613 assert (start + range >= 0 && start + range <= length);
616 /* If initial states with non-begbuf contexts have no elements,
617 the regex must be anchored. If preg->newline_anchor is set,
618 we'll never use init_state_nl, so do not check it. */
619 if (dfa->init_state->nodes.nelem == 0
620 && dfa->init_state_word->nodes.nelem == 0
621 && (dfa->init_state_nl->nodes.nelem == 0
622 || !preg->newline_anchor))
624 if (start != 0 && start + range != 0)
629 /* We must check the longest matching, if nmatch > 0. */
630 fl_longest_match = (nmatch != 0 || dfa->nbackref);
632 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
633 preg->translate, preg->syntax & RE_ICASE, dfa);
634 if (BE (err != REG_NOERROR, 0))
636 mctx.input.stop = stop;
637 mctx.input.raw_stop = stop;
638 mctx.input.newline_anchor = preg->newline_anchor;
640 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
641 if (BE (err != REG_NOERROR, 0))
644 /* We will log all the DFA states through which the dfa pass,
645 if nmatch > 1, or this dfa has "multibyte node", which is a
646 back-reference or a node which can accept multibyte character or
647 multi character collating element. */
648 if (nmatch > 1 || dfa->has_mb_node)
650 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
651 if (BE (mctx.state_log == NULL, 0))
658 mctx.state_log = NULL;
661 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
662 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
664 /* Check incrementally whether of not the input string match. */
665 incr = (range < 0) ? -1 : 1;
666 left_lim = (range < 0) ? start + range : start;
667 right_lim = (range < 0) ? start : start + range;
668 sb = dfa->mb_cur_max == 1;
671 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
672 | (range >= 0 ? 2 : 0)
673 | (t != NULL ? 1 : 0))
676 for (;; match_first += incr)
679 if (match_first < left_lim || right_lim < match_first)
682 /* Advance as rapidly as possible through the string, until we
683 find a plausible place to start matching. This may be done
684 with varying efficiency, so there are various possibilities:
685 only the most common of them are specialized, in order to
686 save on code size. We use a switch statement for speed. */
694 /* Fastmap with single-byte translation, match forward. */
695 while (BE (match_first < right_lim, 1)
696 && !fastmap[t[(unsigned char) string[match_first]]])
698 goto forward_match_found_start_or_reached_end;
701 /* Fastmap without translation, match forward. */
702 while (BE (match_first < right_lim, 1)
703 && !fastmap[(unsigned char) string[match_first]])
706 forward_match_found_start_or_reached_end:
707 if (BE (match_first == right_lim, 0))
709 ch = match_first >= length
710 ? 0 : (unsigned char) string[match_first];
711 if (!fastmap[t ? t[ch] : ch])
718 /* Fastmap without multi-byte translation, match backwards. */
719 while (match_first >= left_lim)
721 ch = match_first >= length
722 ? 0 : (unsigned char) string[match_first];
723 if (fastmap[t ? t[ch] : ch])
727 if (match_first < left_lim)
732 /* In this case, we can't determine easily the current byte,
733 since it might be a component byte of a multibyte
734 character. Then we use the constructed buffer instead. */
737 /* If MATCH_FIRST is out of the valid range, reconstruct the
739 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
740 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
742 err = re_string_reconstruct (&mctx.input, match_first,
744 if (BE (err != REG_NOERROR, 0))
747 offset = match_first - mctx.input.raw_mbs_idx;
749 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
750 Note that MATCH_FIRST must not be smaller than 0. */
751 ch = (match_first >= length
752 ? 0 : re_string_byte_at (&mctx.input, offset));
756 if (match_first < left_lim || match_first > right_lim)
765 /* Reconstruct the buffers so that the matcher can assume that
766 the matching starts from the beginning of the buffer. */
767 err = re_string_reconstruct (&mctx.input, match_first, eflags);
768 if (BE (err != REG_NOERROR, 0))
771 #ifdef RE_ENABLE_I18N
772 /* Don't consider this char as a possible match start if it part,
773 yet isn't the head, of a multibyte character. */
774 if (!sb && !re_string_first_byte (&mctx.input, 0))
778 /* It seems to be appropriate one, then use the matcher. */
779 /* We assume that the matching starts from 0. */
780 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
781 match_last = check_matching (&mctx, fl_longest_match,
782 range >= 0 ? &match_first : NULL);
783 if (match_last != -1)
785 if (BE (match_last == -2, 0))
792 mctx.match_last = match_last;
793 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
795 re_dfastate_t *pstate = mctx.state_log[match_last];
796 mctx.last_node = check_halt_state_context (&mctx, pstate,
799 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
802 err = prune_impossible_nodes (&mctx);
803 if (err == REG_NOERROR)
805 if (BE (err != REG_NOMATCH, 0))
810 break; /* We found a match. */
814 match_ctx_clean (&mctx);
818 assert (match_last != -1);
819 assert (err == REG_NOERROR);
822 /* Set pmatch[] if we need. */
827 /* Initialize registers. */
828 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
829 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
831 /* Set the points where matching start/end. */
833 pmatch[0].rm_eo = mctx.match_last;
835 if (!preg->no_sub && nmatch > 1)
837 err = set_regs (preg, &mctx, nmatch, pmatch,
838 dfa->has_plural_match && dfa->nbackref > 0);
839 if (BE (err != REG_NOERROR, 0))
843 /* At last, add the offset to the each registers, since we slided
844 the buffers so that we could assume that the matching starts
846 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
847 if (pmatch[reg_idx].rm_so != -1)
849 #ifdef RE_ENABLE_I18N
850 if (BE (mctx.input.offsets_needed != 0, 0))
852 pmatch[reg_idx].rm_so =
853 (pmatch[reg_idx].rm_so == mctx.input.valid_len
854 ? mctx.input.valid_raw_len
855 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
856 pmatch[reg_idx].rm_eo =
857 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
858 ? mctx.input.valid_raw_len
859 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
862 assert (mctx.input.offsets_needed == 0);
864 pmatch[reg_idx].rm_so += match_first;
865 pmatch[reg_idx].rm_eo += match_first;
867 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
869 pmatch[nmatch + reg_idx].rm_so = -1;
870 pmatch[nmatch + reg_idx].rm_eo = -1;
874 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
875 if (dfa->subexp_map[reg_idx] != reg_idx)
877 pmatch[reg_idx + 1].rm_so
878 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
879 pmatch[reg_idx + 1].rm_eo
880 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
885 re_free (mctx.state_log);
887 match_ctx_free (&mctx);
888 re_string_destruct (&mctx.input);
893 prune_impossible_nodes (mctx)
894 re_match_context_t *mctx;
896 const re_dfa_t *const dfa = mctx->dfa;
897 int halt_node, match_last;
899 re_dfastate_t **sifted_states;
900 re_dfastate_t **lim_states = NULL;
901 re_sift_context_t sctx;
903 assert (mctx->state_log != NULL);
905 match_last = mctx->match_last;
906 halt_node = mctx->last_node;
907 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
908 if (BE (sifted_states == NULL, 0))
915 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
916 if (BE (lim_states == NULL, 0))
923 memset (lim_states, '\0',
924 sizeof (re_dfastate_t *) * (match_last + 1));
925 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
927 ret = sift_states_backward (mctx, &sctx);
928 re_node_set_free (&sctx.limits);
929 if (BE (ret != REG_NOERROR, 0))
931 if (sifted_states[0] != NULL || lim_states[0] != NULL)
941 } while (mctx->state_log[match_last] == NULL
942 || !mctx->state_log[match_last]->halt);
943 halt_node = check_halt_state_context (mctx,
944 mctx->state_log[match_last],
947 ret = merge_state_array (dfa, sifted_states, lim_states,
949 re_free (lim_states);
951 if (BE (ret != REG_NOERROR, 0))
956 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
957 ret = sift_states_backward (mctx, &sctx);
958 re_node_set_free (&sctx.limits);
959 if (BE (ret != REG_NOERROR, 0))
962 re_free (mctx->state_log);
963 mctx->state_log = sifted_states;
964 sifted_states = NULL;
965 mctx->last_node = halt_node;
966 mctx->match_last = match_last;
969 re_free (sifted_states);
970 re_free (lim_states);
974 /* Acquire an initial state and return it.
975 We must select appropriate initial state depending on the context,
976 since initial states may have constraints like "\<", "^", etc.. */
978 static __inline__ re_dfastate_t *
979 __attribute ((always_inline)) internal_function
980 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
983 const re_dfa_t *const dfa = mctx->dfa;
984 if (dfa->init_state->has_constraint)
986 unsigned int context;
987 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
988 if (IS_WORD_CONTEXT (context))
989 return dfa->init_state_word;
990 else if (IS_ORDINARY_CONTEXT (context))
991 return dfa->init_state;
992 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
993 return dfa->init_state_begbuf;
994 else if (IS_NEWLINE_CONTEXT (context))
995 return dfa->init_state_nl;
996 else if (IS_BEGBUF_CONTEXT (context))
998 /* It is relatively rare case, then calculate on demand. */
999 return re_acquire_state_context (err, dfa,
1000 dfa->init_state->entrance_nodes,
1004 /* Must not happen? */
1005 return dfa->init_state;
1008 return dfa->init_state;
1011 /* Check whether the regular expression match input string INPUT or not,
1012 and return the index where the matching end, return -1 if not match,
1013 or return -2 in case of an error.
1014 FL_LONGEST_MATCH means we want the POSIX longest matching.
1015 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1016 next place where we may want to try matching.
1017 Note that the matcher assume that the maching starts from the current
1018 index of the buffer. */
1022 check_matching (re_match_context_t *mctx, int fl_longest_match,
1025 const re_dfa_t *const dfa = mctx->dfa;
1028 int match_last = -1;
1029 int cur_str_idx = re_string_cur_idx (&mctx->input);
1030 re_dfastate_t *cur_state;
1031 int at_init_state = p_match_first != NULL;
1032 int next_start_idx = cur_str_idx;
1035 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1036 /* An initial state must not be NULL (invalid). */
1037 if (BE (cur_state == NULL, 0))
1039 assert (err == REG_ESPACE);
1043 if (mctx->state_log != NULL)
1045 mctx->state_log[cur_str_idx] = cur_state;
1047 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1048 later. E.g. Processing back references. */
1049 if (BE (dfa->nbackref, 0))
1052 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1053 if (BE (err != REG_NOERROR, 0))
1056 if (cur_state->has_backref)
1058 err = transit_state_bkref (mctx, &cur_state->nodes);
1059 if (BE (err != REG_NOERROR, 0))
1065 /* If the RE accepts NULL string. */
1066 if (BE (cur_state->halt, 0))
1068 if (!cur_state->has_constraint
1069 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1071 if (!fl_longest_match)
1075 match_last = cur_str_idx;
1081 while (!re_string_eoi (&mctx->input))
1083 re_dfastate_t *old_state = cur_state;
1084 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1086 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1087 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1088 && mctx->input.valid_len < mctx->input.len))
1090 err = extend_buffers (mctx);
1091 if (BE (err != REG_NOERROR, 0))
1093 assert (err == REG_ESPACE);
1098 cur_state = transit_state (&err, mctx, cur_state);
1099 if (mctx->state_log != NULL)
1100 cur_state = merge_state_with_log (&err, mctx, cur_state);
1102 if (cur_state == NULL)
1104 /* Reached the invalid state or an error. Try to recover a valid
1105 state using the state log, if available and if we have not
1106 already found a valid (even if not the longest) match. */
1107 if (BE (err != REG_NOERROR, 0))
1110 if (mctx->state_log == NULL
1111 || (match && !fl_longest_match)
1112 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1116 if (BE (at_init_state, 0))
1118 if (old_state == cur_state)
1119 next_start_idx = next_char_idx;
1124 if (cur_state->halt)
1126 /* Reached a halt state.
1127 Check the halt state can satisfy the current context. */
1128 if (!cur_state->has_constraint
1129 || check_halt_state_context (mctx, cur_state,
1130 re_string_cur_idx (&mctx->input)))
1132 /* We found an appropriate halt state. */
1133 match_last = re_string_cur_idx (&mctx->input);
1136 /* We found a match, do not modify match_first below. */
1137 p_match_first = NULL;
1138 if (!fl_longest_match)
1145 *p_match_first += next_start_idx;
1150 /* Check NODE match the current context. */
1154 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1156 re_token_type_t type = dfa->nodes[node].type;
1157 unsigned int constraint = dfa->nodes[node].constraint;
1158 if (type != END_OF_RE)
1162 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1167 /* Check the halt state STATE match the current context.
1168 Return 0 if not match, if the node, STATE has, is a halt node and
1169 match the context, return the node. */
1173 check_halt_state_context (const re_match_context_t *mctx,
1174 const re_dfastate_t *state, int idx)
1177 unsigned int context;
1179 assert (state->halt);
1181 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1182 for (i = 0; i < state->nodes.nelem; ++i)
1183 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1184 return state->nodes.elems[i];
1188 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1189 corresponding to the DFA).
1190 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1195 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1196 int *pidx, int node, re_node_set *eps_via_nodes,
1197 struct re_fail_stack_t *fs)
1199 const re_dfa_t *const dfa = mctx->dfa;
1201 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1203 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1204 re_node_set *edests = &dfa->edests[node];
1206 err = re_node_set_insert (eps_via_nodes, node);
1207 if (BE (err < 0, 0))
1209 /* Pick up a valid destination, or return -1 if none is found. */
1210 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1212 int candidate = edests->elems[i];
1213 if (!re_node_set_contains (cur_nodes, candidate))
1215 if (dest_node == -1)
1216 dest_node = candidate;
1220 /* In order to avoid infinite loop like "(a*)*", return the second
1221 epsilon-transition if the first was already considered. */
1222 if (re_node_set_contains (eps_via_nodes, dest_node))
1225 /* Otherwise, push the second epsilon-transition on the fail stack. */
1227 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1231 /* We know we are going to exit. */
1240 re_token_type_t type = dfa->nodes[node].type;
1242 #ifdef RE_ENABLE_I18N
1243 if (dfa->nodes[node].accept_mb)
1244 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1246 #endif /* RE_ENABLE_I18N */
1247 if (type == OP_BACK_REF)
1249 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1250 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1253 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1257 char *buf = (char *) re_string_get_buffer (&mctx->input);
1258 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1267 err = re_node_set_insert (eps_via_nodes, node);
1268 if (BE (err < 0, 0))
1270 dest_node = dfa->edests[node].elems[0];
1271 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1278 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1280 int dest_node = dfa->nexts[node];
1281 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1282 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1283 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1286 re_node_set_empty (eps_via_nodes);
1293 static reg_errcode_t
1295 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1296 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1299 int num = fs->num++;
1300 if (fs->num == fs->alloc)
1302 struct re_fail_stack_ent_t *new_array;
1303 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1305 if (new_array == NULL)
1308 fs->stack = new_array;
1310 fs->stack[num].idx = str_idx;
1311 fs->stack[num].node = dest_node;
1312 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1313 if (fs->stack[num].regs == NULL)
1315 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1316 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1322 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1323 regmatch_t *regs, re_node_set *eps_via_nodes)
1325 int num = --fs->num;
1327 *pidx = fs->stack[num].idx;
1328 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1329 re_node_set_free (eps_via_nodes);
1330 re_free (fs->stack[num].regs);
1331 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1332 return fs->stack[num].node;
1335 /* Set the positions where the subexpressions are starts/ends to registers
1337 Note: We assume that pmatch[0] is already set, and
1338 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1340 static reg_errcode_t
1342 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1343 regmatch_t *pmatch, int fl_backtrack)
1345 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1347 re_node_set eps_via_nodes;
1348 struct re_fail_stack_t *fs;
1349 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1350 regmatch_t *prev_idx_match;
1351 int prev_idx_match_malloced = 0;
1354 assert (nmatch > 1);
1355 assert (mctx->state_log != NULL);
1360 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1361 if (fs->stack == NULL)
1367 cur_node = dfa->init_node;
1368 re_node_set_init_empty (&eps_via_nodes);
1370 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1371 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1374 prev_idx_match = re_malloc (regmatch_t, nmatch);
1375 if (prev_idx_match == NULL)
1377 free_fail_stack_return (fs);
1380 prev_idx_match_malloced = 1;
1382 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1384 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1386 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1388 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1393 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1394 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1396 if (reg_idx == nmatch)
1398 re_node_set_free (&eps_via_nodes);
1399 if (prev_idx_match_malloced)
1400 re_free (prev_idx_match);
1401 return free_fail_stack_return (fs);
1403 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1408 re_node_set_free (&eps_via_nodes);
1409 if (prev_idx_match_malloced)
1410 re_free (prev_idx_match);
1415 /* Proceed to next node. */
1416 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1417 &eps_via_nodes, fs);
1419 if (BE (cur_node < 0, 0))
1421 if (BE (cur_node == -2, 0))
1423 re_node_set_free (&eps_via_nodes);
1424 if (prev_idx_match_malloced)
1425 re_free (prev_idx_match);
1426 free_fail_stack_return (fs);
1430 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1434 re_node_set_free (&eps_via_nodes);
1435 if (prev_idx_match_malloced)
1436 re_free (prev_idx_match);
1441 re_node_set_free (&eps_via_nodes);
1442 if (prev_idx_match_malloced)
1443 re_free (prev_idx_match);
1444 return free_fail_stack_return (fs);
1447 static reg_errcode_t
1449 free_fail_stack_return (struct re_fail_stack_t *fs)
1454 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1456 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1457 re_free (fs->stack[fs_idx].regs);
1459 re_free (fs->stack);
1466 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1467 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1469 int type = dfa->nodes[cur_node].type;
1470 if (type == OP_OPEN_SUBEXP)
1472 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1474 /* We are at the first node of this sub expression. */
1475 if (reg_num < nmatch)
1477 pmatch[reg_num].rm_so = cur_idx;
1478 pmatch[reg_num].rm_eo = -1;
1481 else if (type == OP_CLOSE_SUBEXP)
1483 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1484 if (reg_num < nmatch)
1486 /* We are at the last node of this sub expression. */
1487 if (pmatch[reg_num].rm_so < cur_idx)
1489 pmatch[reg_num].rm_eo = cur_idx;
1490 /* This is a non-empty match or we are not inside an optional
1491 subexpression. Accept this right away. */
1492 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1496 if (dfa->nodes[cur_node].opt_subexp
1497 && prev_idx_match[reg_num].rm_so != -1)
1498 /* We transited through an empty match for an optional
1499 subexpression, like (a?)*, and this is not the subexp's
1500 first match. Copy back the old content of the registers
1501 so that matches of an inner subexpression are undone as
1502 well, like in ((a?))*. */
1503 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1505 /* We completed a subexpression, but it may be part of
1506 an optional one, so do not update PREV_IDX_MATCH. */
1507 pmatch[reg_num].rm_eo = cur_idx;
1513 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1514 and sift the nodes in each states according to the following rules.
1515 Updated state_log will be wrote to STATE_LOG.
1517 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1518 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1519 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1520 the LAST_NODE, we throw away the node `a'.
1521 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1522 string `s' and transit to `b':
1523 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1525 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1526 thrown away, we throw away the node `a'.
1527 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1528 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1530 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1531 we throw away the node `a'. */
1533 #define STATE_NODE_CONTAINS(state,node) \
1534 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1536 static reg_errcode_t
1538 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1542 int str_idx = sctx->last_str_idx;
1543 re_node_set cur_dest;
1546 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1549 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1550 transit to the last_node and the last_node itself. */
1551 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1552 if (BE (err != REG_NOERROR, 0))
1554 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1555 if (BE (err != REG_NOERROR, 0))
1558 /* Then check each states in the state_log. */
1561 /* Update counters. */
1562 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1563 if (null_cnt > mctx->max_mb_elem_len)
1565 memset (sctx->sifted_states, '\0',
1566 sizeof (re_dfastate_t *) * str_idx);
1567 re_node_set_free (&cur_dest);
1570 re_node_set_empty (&cur_dest);
1573 if (mctx->state_log[str_idx])
1575 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1576 if (BE (err != REG_NOERROR, 0))
1580 /* Add all the nodes which satisfy the following conditions:
1581 - It can epsilon transit to a node in CUR_DEST.
1583 And update state_log. */
1584 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1585 if (BE (err != REG_NOERROR, 0))
1590 re_node_set_free (&cur_dest);
1594 static reg_errcode_t
1596 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1597 int str_idx, re_node_set *cur_dest)
1599 const re_dfa_t *const dfa = mctx->dfa;
1600 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1603 /* Then build the next sifted state.
1604 We build the next sifted state on `cur_dest', and update
1605 `sifted_states[str_idx]' with `cur_dest'.
1607 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1608 `cur_src' points the node_set of the old `state_log[str_idx]'
1609 (with the epsilon nodes pre-filtered out). */
1610 for (i = 0; i < cur_src->nelem; i++)
1612 int prev_node = cur_src->elems[i];
1617 re_token_type_t type = dfa->nodes[prev_node].type;
1618 assert (!IS_EPSILON_NODE (type));
1620 #ifdef RE_ENABLE_I18N
1621 /* If the node may accept `multi byte'. */
1622 if (dfa->nodes[prev_node].accept_mb)
1623 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1624 str_idx, sctx->last_str_idx);
1625 #endif /* RE_ENABLE_I18N */
1627 /* We don't check backreferences here.
1628 See update_cur_sifted_state(). */
1630 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1631 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1632 dfa->nexts[prev_node]))
1638 if (sctx->limits.nelem)
1640 int to_idx = str_idx + naccepted;
1641 if (check_dst_limits (mctx, &sctx->limits,
1642 dfa->nexts[prev_node], to_idx,
1643 prev_node, str_idx))
1646 ret = re_node_set_insert (cur_dest, prev_node);
1647 if (BE (ret == -1, 0))
1654 /* Helper functions. */
1656 static reg_errcode_t
1658 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1660 int top = mctx->state_log_top;
1662 if (next_state_log_idx >= mctx->input.bufs_len
1663 || (next_state_log_idx >= mctx->input.valid_len
1664 && mctx->input.valid_len < mctx->input.len))
1667 err = extend_buffers (mctx);
1668 if (BE (err != REG_NOERROR, 0))
1672 if (top < next_state_log_idx)
1674 memset (mctx->state_log + top + 1, '\0',
1675 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1676 mctx->state_log_top = next_state_log_idx;
1681 static reg_errcode_t
1683 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1684 re_dfastate_t **src, int num)
1688 for (st_idx = 0; st_idx < num; ++st_idx)
1690 if (dst[st_idx] == NULL)
1691 dst[st_idx] = src[st_idx];
1692 else if (src[st_idx] != NULL)
1694 re_node_set merged_set;
1695 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1696 &src[st_idx]->nodes);
1697 if (BE (err != REG_NOERROR, 0))
1699 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1700 re_node_set_free (&merged_set);
1701 if (BE (err != REG_NOERROR, 0))
1708 static reg_errcode_t
1710 update_cur_sifted_state (const re_match_context_t *mctx,
1711 re_sift_context_t *sctx, int str_idx,
1712 re_node_set *dest_nodes)
1714 const re_dfa_t *const dfa = mctx->dfa;
1715 reg_errcode_t err = REG_NOERROR;
1716 const re_node_set *candidates;
1717 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1718 : &mctx->state_log[str_idx]->nodes);
1720 if (dest_nodes->nelem == 0)
1721 sctx->sifted_states[str_idx] = NULL;
1726 /* At first, add the nodes which can epsilon transit to a node in
1728 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1729 if (BE (err != REG_NOERROR, 0))
1732 /* Then, check the limitations in the current sift_context. */
1733 if (sctx->limits.nelem)
1735 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1736 mctx->bkref_ents, str_idx);
1737 if (BE (err != REG_NOERROR, 0))
1742 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1743 if (BE (err != REG_NOERROR, 0))
1747 if (candidates && mctx->state_log[str_idx]->has_backref)
1749 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1750 if (BE (err != REG_NOERROR, 0))
1756 static reg_errcode_t
1758 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1759 const re_node_set *candidates)
1761 reg_errcode_t err = REG_NOERROR;
1764 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1765 if (BE (err != REG_NOERROR, 0))
1768 if (!state->inveclosure.alloc)
1770 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1771 if (BE (err != REG_NOERROR, 0))
1773 for (i = 0; i < dest_nodes->nelem; i++)
1774 re_node_set_merge (&state->inveclosure,
1775 dfa->inveclosures + dest_nodes->elems[i]);
1777 return re_node_set_add_intersect (dest_nodes, candidates,
1778 &state->inveclosure);
1781 static reg_errcode_t
1783 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1784 const re_node_set *candidates)
1788 re_node_set *inv_eclosure = dfa->inveclosures + node;
1789 re_node_set except_nodes;
1790 re_node_set_init_empty (&except_nodes);
1791 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1793 int cur_node = inv_eclosure->elems[ecl_idx];
1794 if (cur_node == node)
1796 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1798 int edst1 = dfa->edests[cur_node].elems[0];
1799 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1800 ? dfa->edests[cur_node].elems[1] : -1);
1801 if ((!re_node_set_contains (inv_eclosure, edst1)
1802 && re_node_set_contains (dest_nodes, edst1))
1804 && !re_node_set_contains (inv_eclosure, edst2)
1805 && re_node_set_contains (dest_nodes, edst2)))
1807 err = re_node_set_add_intersect (&except_nodes, candidates,
1808 dfa->inveclosures + cur_node);
1809 if (BE (err != REG_NOERROR, 0))
1811 re_node_set_free (&except_nodes);
1817 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1819 int cur_node = inv_eclosure->elems[ecl_idx];
1820 if (!re_node_set_contains (&except_nodes, cur_node))
1822 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1823 re_node_set_remove_at (dest_nodes, idx);
1826 re_node_set_free (&except_nodes);
1832 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1833 int dst_node, int dst_idx, int src_node, int src_idx)
1835 const re_dfa_t *const dfa = mctx->dfa;
1836 int lim_idx, src_pos, dst_pos;
1838 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1839 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1840 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1843 struct re_backref_cache_entry *ent;
1844 ent = mctx->bkref_ents + limits->elems[lim_idx];
1845 subexp_idx = dfa->nodes[ent->node].opr.idx;
1847 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1848 subexp_idx, dst_node, dst_idx,
1850 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1851 subexp_idx, src_node, src_idx,
1855 <src> <dst> ( <subexp> )
1856 ( <subexp> ) <src> <dst>
1857 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1858 if (src_pos == dst_pos)
1859 continue; /* This is unrelated limitation. */
1868 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1869 int subexp_idx, int from_node, int bkref_idx)
1871 const re_dfa_t *const dfa = mctx->dfa;
1872 const re_node_set *eclosures = dfa->eclosures + from_node;
1875 /* Else, we are on the boundary: examine the nodes on the epsilon
1877 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1879 int node = eclosures->elems[node_idx];
1880 switch (dfa->nodes[node].type)
1883 if (bkref_idx != -1)
1885 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1890 if (ent->node != node)
1893 if (subexp_idx < BITSET_WORD_BITS
1894 && !(ent->eps_reachable_subexps_map
1895 & ((bitset_word_t) 1 << subexp_idx)))
1898 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1899 OP_CLOSE_SUBEXP cases below. But, if the
1900 destination node is the same node as the source
1901 node, don't recurse because it would cause an
1902 infinite loop: a regex that exhibits this behavior
1904 dst = dfa->edests[node].elems[0];
1905 if (dst == from_node)
1909 else /* if (boundaries & 2) */
1914 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1916 if (cpos == -1 /* && (boundaries & 1) */)
1918 if (cpos == 0 && (boundaries & 2))
1921 if (subexp_idx < BITSET_WORD_BITS)
1922 ent->eps_reachable_subexps_map
1923 &= ~((bitset_word_t) 1 << subexp_idx);
1925 while (ent++->more);
1929 case OP_OPEN_SUBEXP:
1930 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1934 case OP_CLOSE_SUBEXP:
1935 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1944 return (boundaries & 2) ? 1 : 0;
1949 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
1950 int subexp_idx, int from_node, int str_idx,
1953 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1956 /* If we are outside the range of the subexpression, return -1 or 1. */
1957 if (str_idx < lim->subexp_from)
1960 if (lim->subexp_to < str_idx)
1963 /* If we are within the subexpression, return 0. */
1964 boundaries = (str_idx == lim->subexp_from);
1965 boundaries |= (str_idx == lim->subexp_to) << 1;
1966 if (boundaries == 0)
1969 /* Else, examine epsilon closure. */
1970 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1971 from_node, bkref_idx);
1974 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1975 which are against limitations from DEST_NODES. */
1977 static reg_errcode_t
1979 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1980 const re_node_set *candidates, re_node_set *limits,
1981 struct re_backref_cache_entry *bkref_ents, int str_idx)
1984 int node_idx, lim_idx;
1986 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1989 struct re_backref_cache_entry *ent;
1990 ent = bkref_ents + limits->elems[lim_idx];
1992 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1993 continue; /* This is unrelated limitation. */
1995 subexp_idx = dfa->nodes[ent->node].opr.idx;
1996 if (ent->subexp_to == str_idx)
2000 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2002 int node = dest_nodes->elems[node_idx];
2003 re_token_type_t type = dfa->nodes[node].type;
2004 if (type == OP_OPEN_SUBEXP
2005 && subexp_idx == dfa->nodes[node].opr.idx)
2007 else if (type == OP_CLOSE_SUBEXP
2008 && subexp_idx == dfa->nodes[node].opr.idx)
2012 /* Check the limitation of the open subexpression. */
2013 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2016 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2018 if (BE (err != REG_NOERROR, 0))
2022 /* Check the limitation of the close subexpression. */
2024 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2026 int node = dest_nodes->elems[node_idx];
2027 if (!re_node_set_contains (dfa->inveclosures + node,
2029 && !re_node_set_contains (dfa->eclosures + node,
2032 /* It is against this limitation.
2033 Remove it form the current sifted state. */
2034 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2036 if (BE (err != REG_NOERROR, 0))
2042 else /* (ent->subexp_to != str_idx) */
2044 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2046 int node = dest_nodes->elems[node_idx];
2047 re_token_type_t type = dfa->nodes[node].type;
2048 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2050 if (subexp_idx != dfa->nodes[node].opr.idx)
2052 /* It is against this limitation.
2053 Remove it form the current sifted state. */
2054 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2056 if (BE (err != REG_NOERROR, 0))
2065 static reg_errcode_t
2067 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2068 int str_idx, const re_node_set *candidates)
2070 const re_dfa_t *const dfa = mctx->dfa;
2073 re_sift_context_t local_sctx;
2074 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2076 if (first_idx == -1)
2079 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2081 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2084 re_token_type_t type;
2085 struct re_backref_cache_entry *entry;
2086 node = candidates->elems[node_idx];
2087 type = dfa->nodes[node].type;
2088 /* Avoid infinite loop for the REs like "()\1+". */
2089 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2091 if (type != OP_BACK_REF)
2094 entry = mctx->bkref_ents + first_idx;
2095 enabled_idx = first_idx;
2102 re_dfastate_t *cur_state;
2104 if (entry->node != node)
2106 subexp_len = entry->subexp_to - entry->subexp_from;
2107 to_idx = str_idx + subexp_len;
2108 dst_node = (subexp_len ? dfa->nexts[node]
2109 : dfa->edests[node].elems[0]);
2111 if (to_idx > sctx->last_str_idx
2112 || sctx->sifted_states[to_idx] == NULL
2113 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2114 || check_dst_limits (mctx, &sctx->limits, node,
2115 str_idx, dst_node, to_idx))
2118 if (local_sctx.sifted_states == NULL)
2121 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2122 if (BE (err != REG_NOERROR, 0))
2125 local_sctx.last_node = node;
2126 local_sctx.last_str_idx = str_idx;
2127 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2128 if (BE (ret < 0, 0))
2133 cur_state = local_sctx.sifted_states[str_idx];
2134 err = sift_states_backward (mctx, &local_sctx);
2135 if (BE (err != REG_NOERROR, 0))
2137 if (sctx->limited_states != NULL)
2139 err = merge_state_array (dfa, sctx->limited_states,
2140 local_sctx.sifted_states,
2142 if (BE (err != REG_NOERROR, 0))
2145 local_sctx.sifted_states[str_idx] = cur_state;
2146 re_node_set_remove (&local_sctx.limits, enabled_idx);
2148 /* mctx->bkref_ents may have changed, reload the pointer. */
2149 entry = mctx->bkref_ents + enabled_idx;
2151 while (enabled_idx++, entry++->more);
2155 if (local_sctx.sifted_states != NULL)
2157 re_node_set_free (&local_sctx.limits);
2164 #ifdef RE_ENABLE_I18N
2167 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2168 int node_idx, int str_idx, int max_str_idx)
2170 const re_dfa_t *const dfa = mctx->dfa;
2172 /* Check the node can accept `multi byte'. */
2173 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2174 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2175 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2176 dfa->nexts[node_idx]))
2177 /* The node can't accept the `multi byte', or the
2178 destination was already thrown away, then the node
2179 could't accept the current input `multi byte'. */
2181 /* Otherwise, it is sure that the node could accept
2182 `naccepted' bytes input. */
2185 #endif /* RE_ENABLE_I18N */
2188 /* Functions for state transition. */
2190 /* Return the next state to which the current state STATE will transit by
2191 accepting the current input byte, and update STATE_LOG if necessary.
2192 If STATE can accept a multibyte char/collating element/back reference
2193 update the destination of STATE_LOG. */
2195 static re_dfastate_t *
2197 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2198 re_dfastate_t *state)
2200 re_dfastate_t **trtable;
2203 #ifdef RE_ENABLE_I18N
2204 /* If the current state can accept multibyte. */
2205 if (BE (state->accept_mb, 0))
2207 *err = transit_state_mb (mctx, state);
2208 if (BE (*err != REG_NOERROR, 0))
2211 #endif /* RE_ENABLE_I18N */
2213 /* Then decide the next state with the single byte. */
2216 /* don't use transition table */
2217 return transit_state_sb (err, mctx, state);
2220 /* Use transition table */
2221 ch = re_string_fetch_byte (&mctx->input);
2224 trtable = state->trtable;
2225 if (BE (trtable != NULL, 1))
2228 trtable = state->word_trtable;
2229 if (BE (trtable != NULL, 1))
2231 unsigned int context;
2233 = re_string_context_at (&mctx->input,
2234 re_string_cur_idx (&mctx->input) - 1,
2236 if (IS_WORD_CONTEXT (context))
2237 return trtable[ch + SBC_MAX];
2242 if (!build_trtable (mctx->dfa, state))
2248 /* Retry, we now have a transition table. */
2252 /* Update the state_log if we need */
2255 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2256 re_dfastate_t *next_state)
2258 const re_dfa_t *const dfa = mctx->dfa;
2259 int cur_idx = re_string_cur_idx (&mctx->input);
2261 if (cur_idx > mctx->state_log_top)
2263 mctx->state_log[cur_idx] = next_state;
2264 mctx->state_log_top = cur_idx;
2266 else if (mctx->state_log[cur_idx] == 0)
2268 mctx->state_log[cur_idx] = next_state;
2272 re_dfastate_t *pstate;
2273 unsigned int context;
2274 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2275 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2276 the destination of a multibyte char/collating element/
2277 back reference. Then the next state is the union set of
2278 these destinations and the results of the transition table. */
2279 pstate = mctx->state_log[cur_idx];
2280 log_nodes = pstate->entrance_nodes;
2281 if (next_state != NULL)
2283 table_nodes = next_state->entrance_nodes;
2284 *err = re_node_set_init_union (&next_nodes, table_nodes,
2286 if (BE (*err != REG_NOERROR, 0))
2290 next_nodes = *log_nodes;
2291 /* Note: We already add the nodes of the initial state,
2292 then we don't need to add them here. */
2294 context = re_string_context_at (&mctx->input,
2295 re_string_cur_idx (&mctx->input) - 1,
2297 next_state = mctx->state_log[cur_idx]
2298 = re_acquire_state_context (err, dfa, &next_nodes, context);
2299 /* We don't need to check errors here, since the return value of
2300 this function is next_state and ERR is already set. */
2302 if (table_nodes != NULL)
2303 re_node_set_free (&next_nodes);
2306 if (BE (dfa->nbackref, 0) && next_state != NULL)
2308 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2309 later. We must check them here, since the back references in the
2310 next state might use them. */
2311 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2313 if (BE (*err != REG_NOERROR, 0))
2316 /* If the next state has back references. */
2317 if (next_state->has_backref)
2319 *err = transit_state_bkref (mctx, &next_state->nodes);
2320 if (BE (*err != REG_NOERROR, 0))
2322 next_state = mctx->state_log[cur_idx];
2329 /* Skip bytes in the input that correspond to part of a
2330 multi-byte match, then look in the log for a state
2331 from which to restart matching. */
2334 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2336 re_dfastate_t *cur_state;
2339 int max = mctx->state_log_top;
2340 int cur_str_idx = re_string_cur_idx (&mctx->input);
2344 if (++cur_str_idx > max)
2346 re_string_skip_bytes (&mctx->input, 1);
2348 while (mctx->state_log[cur_str_idx] == NULL);
2350 cur_state = merge_state_with_log (err, mctx, NULL);
2352 while (*err == REG_NOERROR && cur_state == NULL);
2356 /* Helper functions for transit_state. */
2358 /* From the node set CUR_NODES, pick up the nodes whose types are
2359 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2360 expression. And register them to use them later for evaluating the
2361 correspoding back references. */
2363 static reg_errcode_t
2365 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2368 const re_dfa_t *const dfa = mctx->dfa;
2372 /* TODO: This isn't efficient.
2373 Because there might be more than one nodes whose types are
2374 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2377 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2379 int node = cur_nodes->elems[node_idx];
2380 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2381 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2382 && (dfa->used_bkref_map
2383 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2385 err = match_ctx_add_subtop (mctx, node, str_idx);
2386 if (BE (err != REG_NOERROR, 0))
2394 /* Return the next state to which the current state STATE will transit by
2395 accepting the current input byte. */
2397 static re_dfastate_t *
2398 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2399 re_dfastate_t *state)
2401 const re_dfa_t *const dfa = mctx->dfa;
2402 re_node_set next_nodes;
2403 re_dfastate_t *next_state;
2404 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2405 unsigned int context;
2407 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2408 if (BE (*err != REG_NOERROR, 0))
2410 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2412 int cur_node = state->nodes.elems[node_cnt];
2413 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2415 *err = re_node_set_merge (&next_nodes,
2416 dfa->eclosures + dfa->nexts[cur_node]);
2417 if (BE (*err != REG_NOERROR, 0))
2419 re_node_set_free (&next_nodes);
2424 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2425 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2426 /* We don't need to check errors here, since the return value of
2427 this function is next_state and ERR is already set. */
2429 re_node_set_free (&next_nodes);
2430 re_string_skip_bytes (&mctx->input, 1);
2435 #ifdef RE_ENABLE_I18N
2436 static reg_errcode_t
2438 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2440 const re_dfa_t *const dfa = mctx->dfa;
2444 for (i = 0; i < pstate->nodes.nelem; ++i)
2446 re_node_set dest_nodes, *new_nodes;
2447 int cur_node_idx = pstate->nodes.elems[i];
2448 int naccepted, dest_idx;
2449 unsigned int context;
2450 re_dfastate_t *dest_state;
2452 if (!dfa->nodes[cur_node_idx].accept_mb)
2455 if (dfa->nodes[cur_node_idx].constraint)
2457 context = re_string_context_at (&mctx->input,
2458 re_string_cur_idx (&mctx->input),
2460 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2465 /* How many bytes the node can accept? */
2466 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2467 re_string_cur_idx (&mctx->input));
2471 /* The node can accepts `naccepted' bytes. */
2472 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2473 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2474 : mctx->max_mb_elem_len);
2475 err = clean_state_log_if_needed (mctx, dest_idx);
2476 if (BE (err != REG_NOERROR, 0))
2479 assert (dfa->nexts[cur_node_idx] != -1);
2481 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2483 dest_state = mctx->state_log[dest_idx];
2484 if (dest_state == NULL)
2485 dest_nodes = *new_nodes;
2488 err = re_node_set_init_union (&dest_nodes,
2489 dest_state->entrance_nodes, new_nodes);
2490 if (BE (err != REG_NOERROR, 0))
2493 context = re_string_context_at (&mctx->input, dest_idx - 1,
2495 mctx->state_log[dest_idx]
2496 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2497 if (dest_state != NULL)
2498 re_node_set_free (&dest_nodes);
2499 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2504 #endif /* RE_ENABLE_I18N */
2506 static reg_errcode_t
2508 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2510 const re_dfa_t *const dfa = mctx->dfa;
2513 int cur_str_idx = re_string_cur_idx (&mctx->input);
2515 for (i = 0; i < nodes->nelem; ++i)
2517 int dest_str_idx, prev_nelem, bkc_idx;
2518 int node_idx = nodes->elems[i];
2519 unsigned int context;
2520 const re_token_t *node = dfa->nodes + node_idx;
2521 re_node_set *new_dest_nodes;
2523 /* Check whether `node' is a backreference or not. */
2524 if (node->type != OP_BACK_REF)
2527 if (node->constraint)
2529 context = re_string_context_at (&mctx->input, cur_str_idx,
2531 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2535 /* `node' is a backreference.
2536 Check the substring which the substring matched. */
2537 bkc_idx = mctx->nbkref_ents;
2538 err = get_subexp (mctx, node_idx, cur_str_idx);
2539 if (BE (err != REG_NOERROR, 0))
2542 /* And add the epsilon closures (which is `new_dest_nodes') of
2543 the backreference to appropriate state_log. */
2545 assert (dfa->nexts[node_idx] != -1);
2547 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2550 re_dfastate_t *dest_state;
2551 struct re_backref_cache_entry *bkref_ent;
2552 bkref_ent = mctx->bkref_ents + bkc_idx;
2553 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2555 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2556 new_dest_nodes = (subexp_len == 0
2557 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2558 : dfa->eclosures + dfa->nexts[node_idx]);
2559 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2560 - bkref_ent->subexp_from);
2561 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2563 dest_state = mctx->state_log[dest_str_idx];
2564 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2565 : mctx->state_log[cur_str_idx]->nodes.nelem);
2566 /* Add `new_dest_node' to state_log. */
2567 if (dest_state == NULL)
2569 mctx->state_log[dest_str_idx]
2570 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2572 if (BE (mctx->state_log[dest_str_idx] == NULL
2573 && err != REG_NOERROR, 0))
2578 re_node_set dest_nodes;
2579 err = re_node_set_init_union (&dest_nodes,
2580 dest_state->entrance_nodes,
2582 if (BE (err != REG_NOERROR, 0))
2584 re_node_set_free (&dest_nodes);
2587 mctx->state_log[dest_str_idx]
2588 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2589 re_node_set_free (&dest_nodes);
2590 if (BE (mctx->state_log[dest_str_idx] == NULL
2591 && err != REG_NOERROR, 0))
2594 /* We need to check recursively if the backreference can epsilon
2597 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2599 err = check_subexp_matching_top (mctx, new_dest_nodes,
2601 if (BE (err != REG_NOERROR, 0))
2603 err = transit_state_bkref (mctx, new_dest_nodes);
2604 if (BE (err != REG_NOERROR, 0))
2614 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2615 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2616 Note that we might collect inappropriate candidates here.
2617 However, the cost of checking them strictly here is too high, then we
2618 delay these checking for prune_impossible_nodes(). */
2620 static reg_errcode_t
2622 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2624 const re_dfa_t *const dfa = mctx->dfa;
2625 int subexp_num, sub_top_idx;
2626 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2627 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2628 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2629 if (cache_idx != -1)
2631 const struct re_backref_cache_entry *entry
2632 = mctx->bkref_ents + cache_idx;
2634 if (entry->node == bkref_node)
2635 return REG_NOERROR; /* We already checked it. */
2636 while (entry++->more);
2639 subexp_num = dfa->nodes[bkref_node].opr.idx;
2641 /* For each sub expression */
2642 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2645 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2646 re_sub_match_last_t *sub_last;
2647 int sub_last_idx, sl_str, bkref_str_off;
2649 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2650 continue; /* It isn't related. */
2652 sl_str = sub_top->str_idx;
2653 bkref_str_off = bkref_str_idx;
2654 /* At first, check the last node of sub expressions we already
2656 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2659 sub_last = sub_top->lasts[sub_last_idx];
2660 sl_str_diff = sub_last->str_idx - sl_str;
2661 /* The matched string by the sub expression match with the substring
2662 at the back reference? */
2663 if (sl_str_diff > 0)
2665 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2667 /* Not enough chars for a successful match. */
2668 if (bkref_str_off + sl_str_diff > mctx->input.len)
2671 err = clean_state_log_if_needed (mctx,
2674 if (BE (err != REG_NOERROR, 0))
2676 buf = (const char *) re_string_get_buffer (&mctx->input);
2678 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2679 /* We don't need to search this sub expression any more. */
2682 bkref_str_off += sl_str_diff;
2683 sl_str += sl_str_diff;
2684 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2687 /* Reload buf, since the preceding call might have reallocated
2689 buf = (const char *) re_string_get_buffer (&mctx->input);
2691 if (err == REG_NOMATCH)
2693 if (BE (err != REG_NOERROR, 0))
2697 if (sub_last_idx < sub_top->nlasts)
2699 if (sub_last_idx > 0)
2701 /* Then, search for the other last nodes of the sub expression. */
2702 for (; sl_str <= bkref_str_idx; ++sl_str)
2704 int cls_node, sl_str_off;
2705 const re_node_set *nodes;
2706 sl_str_off = sl_str - sub_top->str_idx;
2707 /* The matched string by the sub expression match with the substring
2708 at the back reference? */
2711 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2713 /* If we are at the end of the input, we cannot match. */
2714 if (bkref_str_off >= mctx->input.len)
2717 err = extend_buffers (mctx);
2718 if (BE (err != REG_NOERROR, 0))
2721 buf = (const char *) re_string_get_buffer (&mctx->input);
2723 if (buf [bkref_str_off++] != buf[sl_str - 1])
2724 break; /* We don't need to search this sub expression
2727 if (mctx->state_log[sl_str] == NULL)
2729 /* Does this state have a ')' of the sub expression? */
2730 nodes = &mctx->state_log[sl_str]->nodes;
2731 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2735 if (sub_top->path == NULL)
2737 sub_top->path = calloc (sizeof (state_array_t),
2738 sl_str - sub_top->str_idx + 1);
2739 if (sub_top->path == NULL)
2742 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2743 in the current context? */
2744 err = check_arrival (mctx, sub_top->path, sub_top->node,
2745 sub_top->str_idx, cls_node, sl_str,
2747 if (err == REG_NOMATCH)
2749 if (BE (err != REG_NOERROR, 0))
2751 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2752 if (BE (sub_last == NULL, 0))
2754 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2756 if (err == REG_NOMATCH)
2763 /* Helper functions for get_subexp(). */
2765 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2766 If it can arrive, register the sub expression expressed with SUB_TOP
2769 static reg_errcode_t
2771 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2772 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2776 /* Can the subexpression arrive the back reference? */
2777 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2778 sub_last->str_idx, bkref_node, bkref_str,
2780 if (err != REG_NOERROR)
2782 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2784 if (BE (err != REG_NOERROR, 0))
2786 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2787 return clean_state_log_if_needed (mctx, to_idx);
2790 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2791 Search '(' if FL_OPEN, or search ')' otherwise.
2792 TODO: This function isn't efficient...
2793 Because there might be more than one nodes whose types are
2794 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2800 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2801 int subexp_idx, int type)
2804 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2806 int cls_node = nodes->elems[cls_idx];
2807 const re_token_t *node = dfa->nodes + cls_node;
2808 if (node->type == type
2809 && node->opr.idx == subexp_idx)
2815 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2816 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2818 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2820 static reg_errcode_t
2822 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2823 int top_str, int last_node, int last_str, int type)
2825 const re_dfa_t *const dfa = mctx->dfa;
2826 reg_errcode_t err = REG_NOERROR;
2827 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2828 re_dfastate_t *cur_state = NULL;
2829 re_node_set *cur_nodes, next_nodes;
2830 re_dfastate_t **backup_state_log;
2831 unsigned int context;
2833 subexp_num = dfa->nodes[top_node].opr.idx;
2834 /* Extend the buffer if we need. */
2835 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2837 re_dfastate_t **new_array;
2838 int old_alloc = path->alloc;
2839 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2840 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2841 if (BE (new_array == NULL, 0))
2843 path->alloc = old_alloc;
2846 path->array = new_array;
2847 memset (new_array + old_alloc, '\0',
2848 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2851 str_idx = path->next_idx ?: top_str;
2853 /* Temporary modify MCTX. */
2854 backup_state_log = mctx->state_log;
2855 backup_cur_idx = mctx->input.cur_idx;
2856 mctx->state_log = path->array;
2857 mctx->input.cur_idx = str_idx;
2859 /* Setup initial node set. */
2860 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2861 if (str_idx == top_str)
2863 err = re_node_set_init_1 (&next_nodes, top_node);
2864 if (BE (err != REG_NOERROR, 0))
2866 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2867 if (BE (err != REG_NOERROR, 0))
2869 re_node_set_free (&next_nodes);
2875 cur_state = mctx->state_log[str_idx];
2876 if (cur_state && cur_state->has_backref)
2878 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2879 if (BE (err != REG_NOERROR, 0))
2883 re_node_set_init_empty (&next_nodes);
2885 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2887 if (next_nodes.nelem)
2889 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2891 if (BE (err != REG_NOERROR, 0))
2893 re_node_set_free (&next_nodes);
2897 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2898 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2900 re_node_set_free (&next_nodes);
2903 mctx->state_log[str_idx] = cur_state;
2906 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2908 re_node_set_empty (&next_nodes);
2909 if (mctx->state_log[str_idx + 1])
2911 err = re_node_set_merge (&next_nodes,
2912 &mctx->state_log[str_idx + 1]->nodes);
2913 if (BE (err != REG_NOERROR, 0))
2915 re_node_set_free (&next_nodes);
2921 err = check_arrival_add_next_nodes (mctx, str_idx,
2922 &cur_state->non_eps_nodes,
2924 if (BE (err != REG_NOERROR, 0))
2926 re_node_set_free (&next_nodes);
2931 if (next_nodes.nelem)
2933 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2934 if (BE (err != REG_NOERROR, 0))
2936 re_node_set_free (&next_nodes);
2939 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2941 if (BE (err != REG_NOERROR, 0))
2943 re_node_set_free (&next_nodes);
2947 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2948 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2949 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2951 re_node_set_free (&next_nodes);
2954 mctx->state_log[str_idx] = cur_state;
2955 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2957 re_node_set_free (&next_nodes);
2958 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2959 : &mctx->state_log[last_str]->nodes);
2960 path->next_idx = str_idx;
2963 mctx->state_log = backup_state_log;
2964 mctx->input.cur_idx = backup_cur_idx;
2966 /* Then check the current node set has the node LAST_NODE. */
2967 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2973 /* Helper functions for check_arrival. */
2975 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2977 TODO: This function is similar to the functions transit_state*(),
2978 however this function has many additional works.
2979 Can't we unify them? */
2981 static reg_errcode_t
2983 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2984 re_node_set *cur_nodes, re_node_set *next_nodes)
2986 const re_dfa_t *const dfa = mctx->dfa;
2989 #ifdef RE_ENABLE_I18N
2990 reg_errcode_t err = REG_NOERROR;
2992 re_node_set union_set;
2993 re_node_set_init_empty (&union_set);
2994 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2997 int cur_node = cur_nodes->elems[cur_idx];
2999 re_token_type_t type = dfa->nodes[cur_node].type;
3000 assert (!IS_EPSILON_NODE (type));
3002 #ifdef RE_ENABLE_I18N
3003 /* If the node may accept `multi byte'. */
3004 if (dfa->nodes[cur_node].accept_mb)
3006 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3010 re_dfastate_t *dest_state;
3011 int next_node = dfa->nexts[cur_node];
3012 int next_idx = str_idx + naccepted;
3013 dest_state = mctx->state_log[next_idx];
3014 re_node_set_empty (&union_set);
3017 err = re_node_set_merge (&union_set, &dest_state->nodes);
3018 if (BE (err != REG_NOERROR, 0))
3020 re_node_set_free (&union_set);
3024 result = re_node_set_insert (&union_set, next_node);
3025 if (BE (result < 0, 0))
3027 re_node_set_free (&union_set);
3030 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3032 if (BE (mctx->state_log[next_idx] == NULL
3033 && err != REG_NOERROR, 0))
3035 re_node_set_free (&union_set);
3040 #endif /* RE_ENABLE_I18N */
3042 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3044 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3045 if (BE (result < 0, 0))
3047 re_node_set_free (&union_set);
3052 re_node_set_free (&union_set);
3056 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3057 CUR_NODES, however exclude the nodes which are:
3058 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3059 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3062 static reg_errcode_t
3064 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3065 int ex_subexp, int type)
3068 int idx, outside_node;
3069 re_node_set new_nodes;
3071 assert (cur_nodes->nelem);
3073 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3074 if (BE (err != REG_NOERROR, 0))
3076 /* Create a new node set NEW_NODES with the nodes which are epsilon
3077 closures of the node in CUR_NODES. */
3079 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3081 int cur_node = cur_nodes->elems[idx];
3082 const re_node_set *eclosure = dfa->eclosures + cur_node;
3083 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3084 if (outside_node == -1)
3086 /* There are no problematic nodes, just merge them. */
3087 err = re_node_set_merge (&new_nodes, eclosure);
3088 if (BE (err != REG_NOERROR, 0))
3090 re_node_set_free (&new_nodes);
3096 /* There are problematic nodes, re-calculate incrementally. */
3097 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3099 if (BE (err != REG_NOERROR, 0))
3101 re_node_set_free (&new_nodes);
3106 re_node_set_free (cur_nodes);
3107 *cur_nodes = new_nodes;
3111 /* Helper function for check_arrival_expand_ecl.
3112 Check incrementally the epsilon closure of TARGET, and if it isn't
3113 problematic append it to DST_NODES. */
3115 static reg_errcode_t
3117 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3118 int target, int ex_subexp, int type)
3121 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3125 if (dfa->nodes[cur_node].type == type
3126 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3128 if (type == OP_CLOSE_SUBEXP)
3130 err = re_node_set_insert (dst_nodes, cur_node);
3131 if (BE (err == -1, 0))
3136 err = re_node_set_insert (dst_nodes, cur_node);
3137 if (BE (err == -1, 0))
3139 if (dfa->edests[cur_node].nelem == 0)
3141 if (dfa->edests[cur_node].nelem == 2)
3143 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3144 dfa->edests[cur_node].elems[1],
3146 if (BE (err != REG_NOERROR, 0))
3149 cur_node = dfa->edests[cur_node].elems[0];
3155 /* For all the back references in the current state, calculate the
3156 destination of the back references by the appropriate entry
3157 in MCTX->BKREF_ENTS. */
3159 static reg_errcode_t
3161 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3162 int cur_str, int subexp_num, int type)
3164 const re_dfa_t *const dfa = mctx->dfa;
3166 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3167 struct re_backref_cache_entry *ent;
3169 if (cache_idx_start == -1)
3173 ent = mctx->bkref_ents + cache_idx_start;
3176 int to_idx, next_node;
3178 /* Is this entry ENT is appropriate? */
3179 if (!re_node_set_contains (cur_nodes, ent->node))
3182 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3183 /* Calculate the destination of the back reference, and append it
3184 to MCTX->STATE_LOG. */
3185 if (to_idx == cur_str)
3187 /* The backreference did epsilon transit, we must re-check all the
3188 node in the current state. */
3189 re_node_set new_dests;
3190 reg_errcode_t err2, err3;
3191 next_node = dfa->edests[ent->node].elems[0];
3192 if (re_node_set_contains (cur_nodes, next_node))
3194 err = re_node_set_init_1 (&new_dests, next_node);
3195 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3196 err3 = re_node_set_merge (cur_nodes, &new_dests);
3197 re_node_set_free (&new_dests);
3198 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3199 || err3 != REG_NOERROR, 0))
3201 err = (err != REG_NOERROR ? err
3202 : (err2 != REG_NOERROR ? err2 : err3));
3205 /* TODO: It is still inefficient... */
3210 re_node_set union_set;
3211 next_node = dfa->nexts[ent->node];
3212 if (mctx->state_log[to_idx])
3215 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3218 err = re_node_set_init_copy (&union_set,
3219 &mctx->state_log[to_idx]->nodes);
3220 ret = re_node_set_insert (&union_set, next_node);
3221 if (BE (err != REG_NOERROR || ret < 0, 0))
3223 re_node_set_free (&union_set);
3224 err = err != REG_NOERROR ? err : REG_ESPACE;
3230 err = re_node_set_init_1 (&union_set, next_node);
3231 if (BE (err != REG_NOERROR, 0))
3234 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3235 re_node_set_free (&union_set);
3236 if (BE (mctx->state_log[to_idx] == NULL
3237 && err != REG_NOERROR, 0))
3241 while (ent++->more);
3245 /* Build transition table for the state.
3246 Return 1 if succeeded, otherwise return NULL. */
3250 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3253 int i, j, ch, need_word_trtable = 0;
3254 bitset_word_t elem, mask;
3255 bool dests_node_malloced = false;
3256 bool dest_states_malloced = false;
3257 int ndests; /* Number of the destination states from `state'. */
3258 re_dfastate_t **trtable;
3259 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3260 re_node_set follows, *dests_node;
3262 bitset_t acceptable;
3266 re_node_set dests_node[SBC_MAX];
3267 bitset_t dests_ch[SBC_MAX];
3270 /* We build DFA states which corresponds to the destination nodes
3271 from `state'. `dests_node[i]' represents the nodes which i-th
3272 destination state contains, and `dests_ch[i]' represents the
3273 characters which i-th destination state accepts. */
3274 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3275 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3278 dests_alloc = re_malloc (struct dests_alloc, 1);
3279 if (BE (dests_alloc == NULL, 0))
3281 dests_node_malloced = true;
3283 dests_node = dests_alloc->dests_node;
3284 dests_ch = dests_alloc->dests_ch;
3286 /* Initialize transiton table. */
3287 state->word_trtable = state->trtable = NULL;
3289 /* At first, group all nodes belonging to `state' into several
3291 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3292 if (BE (ndests <= 0, 0))
3294 if (dests_node_malloced)
3296 /* Return 0 in case of an error, 1 otherwise. */
3299 state->trtable = (re_dfastate_t **)
3300 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3306 err = re_node_set_alloc (&follows, ndests + 1);
3307 if (BE (err != REG_NOERROR, 0))
3310 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3311 + ndests * 3 * sizeof (re_dfastate_t *)))
3312 dest_states = (re_dfastate_t **)
3313 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3316 dest_states = (re_dfastate_t **)
3317 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3318 if (BE (dest_states == NULL, 0))
3321 if (dest_states_malloced)
3323 re_node_set_free (&follows);
3324 for (i = 0; i < ndests; ++i)
3325 re_node_set_free (dests_node + i);
3326 if (dests_node_malloced)
3330 dest_states_malloced = true;
3332 dest_states_word = dest_states + ndests;
3333 dest_states_nl = dest_states_word + ndests;
3334 bitset_empty (acceptable);
3336 /* Then build the states for all destinations. */
3337 for (i = 0; i < ndests; ++i)
3340 re_node_set_empty (&follows);
3341 /* Merge the follows of this destination states. */
3342 for (j = 0; j < dests_node[i].nelem; ++j)
3344 next_node = dfa->nexts[dests_node[i].elems[j]];
3345 if (next_node != -1)
3347 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3348 if (BE (err != REG_NOERROR, 0))
3352 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3353 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3355 /* If the new state has context constraint,
3356 build appropriate states for these contexts. */
3357 if (dest_states[i]->has_constraint)
3359 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3361 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3364 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3365 need_word_trtable = 1;
3367 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3369 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3374 dest_states_word[i] = dest_states[i];
3375 dest_states_nl[i] = dest_states[i];
3377 bitset_merge (acceptable, dests_ch[i]);
3380 if (!BE (need_word_trtable, 0))
3382 /* We don't care about whether the following character is a word
3383 character, or we are in a single-byte character set so we can
3384 discern by looking at the character code: allocate a
3385 256-entry transition table. */
3386 trtable = state->trtable = calloc (sizeof (re_dfastate_t *), SBC_MAX);
3387 if (BE (trtable == NULL, 0))
3390 /* For all characters ch...: */
3391 for (i = 0; i < BITSET_WORDS; ++i)
3392 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3394 mask <<= 1, elem >>= 1, ++ch)
3395 if (BE (elem & 1, 0))
3397 /* There must be exactly one destination which accepts
3398 character ch. See group_nodes_into_DFAstates. */
3399 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3402 /* j-th destination accepts the word character ch. */
3403 if (dfa->word_char[i] & mask)
3404 trtable[ch] = dest_states_word[j];
3406 trtable[ch] = dest_states[j];
3411 /* We care about whether the following character is a word
3412 character, and we are in a multi-byte character set: discern
3413 by looking at the character code: build two 256-entry
3414 transition tables, one starting at trtable[0] and one
3415 starting at trtable[SBC_MAX]. */
3416 trtable = state->word_trtable = calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3417 if (BE (trtable == NULL, 0))
3420 /* For all characters ch...: */
3421 for (i = 0; i < BITSET_WORDS; ++i)
3422 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3424 mask <<= 1, elem >>= 1, ++ch)
3425 if (BE (elem & 1, 0))
3427 /* There must be exactly one destination which accepts
3428 character ch. See group_nodes_into_DFAstates. */
3429 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3432 /* j-th destination accepts the word character ch. */
3433 trtable[ch] = dest_states[j];
3434 trtable[ch + SBC_MAX] = dest_states_word[j];
3439 if (bitset_contain (acceptable, NEWLINE_CHAR))
3441 /* The current state accepts newline character. */
3442 for (j = 0; j < ndests; ++j)
3443 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3445 /* k-th destination accepts newline character. */
3446 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3447 if (need_word_trtable)
3448 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3449 /* There must be only one destination which accepts
3450 newline. See group_nodes_into_DFAstates. */
3455 if (dest_states_malloced)
3458 re_node_set_free (&follows);
3459 for (i = 0; i < ndests; ++i)
3460 re_node_set_free (dests_node + i);
3462 if (dests_node_malloced)
3468 /* Group all nodes belonging to STATE into several destinations.
3469 Then for all destinations, set the nodes belonging to the destination
3470 to DESTS_NODE[i] and set the characters accepted by the destination
3471 to DEST_CH[i]. This function return the number of destinations. */
3475 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3476 re_node_set *dests_node, bitset_t *dests_ch)
3481 int ndests; /* Number of the destinations from `state'. */
3482 bitset_t accepts; /* Characters a node can accept. */
3483 const re_node_set *cur_nodes = &state->nodes;
3484 bitset_empty (accepts);
3487 /* For all the nodes belonging to `state', */
3488 for (i = 0; i < cur_nodes->nelem; ++i)
3490 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3491 re_token_type_t type = node->type;
3492 unsigned int constraint = node->constraint;
3494 /* Enumerate all single byte character this node can accept. */
3495 if (type == CHARACTER)
3496 bitset_set (accepts, node->opr.c);
3497 else if (type == SIMPLE_BRACKET)
3499 bitset_merge (accepts, node->opr.sbcset);
3501 else if (type == OP_PERIOD)
3503 #ifdef RE_ENABLE_I18N
3504 if (dfa->mb_cur_max > 1)
3505 bitset_merge (accepts, dfa->sb_char);
3508 bitset_set_all (accepts);
3509 if (!(dfa->syntax & RE_DOT_NEWLINE))
3510 bitset_clear (accepts, '\n');
3511 if (dfa->syntax & RE_DOT_NOT_NULL)
3512 bitset_clear (accepts, '\0');
3514 #ifdef RE_ENABLE_I18N
3515 else if (type == OP_UTF8_PERIOD)
3517 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3518 if (!(dfa->syntax & RE_DOT_NEWLINE))
3519 bitset_clear (accepts, '\n');
3520 if (dfa->syntax & RE_DOT_NOT_NULL)
3521 bitset_clear (accepts, '\0');
3527 /* Check the `accepts' and sift the characters which are not
3528 match it the context. */
3531 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3533 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3534 bitset_empty (accepts);
3535 if (accepts_newline)
3536 bitset_set (accepts, NEWLINE_CHAR);
3540 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3542 bitset_empty (accepts);
3546 if (constraint & NEXT_WORD_CONSTRAINT)
3548 bitset_word_t any_set = 0;
3549 if (type == CHARACTER && !node->word_char)
3551 bitset_empty (accepts);
3554 #ifdef RE_ENABLE_I18N
3555 if (dfa->mb_cur_max > 1)
3556 for (j = 0; j < BITSET_WORDS; ++j)
3557 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3560 for (j = 0; j < BITSET_WORDS; ++j)
3561 any_set |= (accepts[j] &= dfa->word_char[j]);
3565 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3567 bitset_word_t any_set = 0;
3568 if (type == CHARACTER && node->word_char)
3570 bitset_empty (accepts);
3573 #ifdef RE_ENABLE_I18N
3574 if (dfa->mb_cur_max > 1)
3575 for (j = 0; j < BITSET_WORDS; ++j)
3576 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3579 for (j = 0; j < BITSET_WORDS; ++j)
3580 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3586 /* Then divide `accepts' into DFA states, or create a new
3587 state. Above, we make sure that accepts is not empty. */
3588 for (j = 0; j < ndests; ++j)
3590 bitset_t intersec; /* Intersection sets, see below. */
3592 /* Flags, see below. */
3593 bitset_word_t has_intersec, not_subset, not_consumed;
3595 /* Optimization, skip if this state doesn't accept the character. */
3596 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3599 /* Enumerate the intersection set of this state and `accepts'. */
3601 for (k = 0; k < BITSET_WORDS; ++k)
3602 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3603 /* And skip if the intersection set is empty. */
3607 /* Then check if this state is a subset of `accepts'. */
3608 not_subset = not_consumed = 0;
3609 for (k = 0; k < BITSET_WORDS; ++k)
3611 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3612 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3615 /* If this state isn't a subset of `accepts', create a
3616 new group state, which has the `remains'. */
3619 bitset_copy (dests_ch[ndests], remains);
3620 bitset_copy (dests_ch[j], intersec);
3621 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3622 if (BE (err != REG_NOERROR, 0))
3627 /* Put the position in the current group. */
3628 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3629 if (BE (result < 0, 0))
3632 /* If all characters are consumed, go to next node. */
3636 /* Some characters remain, create a new group. */
3639 bitset_copy (dests_ch[ndests], accepts);
3640 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3641 if (BE (err != REG_NOERROR, 0))
3644 bitset_empty (accepts);
3649 for (j = 0; j < ndests; ++j)
3650 re_node_set_free (dests_node + j);
3654 #ifdef RE_ENABLE_I18N
3655 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3656 Return the number of the bytes the node accepts.
3657 STR_IDX is the current index of the input string.
3659 This function handles the nodes which can accept one character, or
3660 one collating element like '.', '[a-z]', opposite to the other nodes
3661 can only accept one byte. */
3665 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3666 const re_string_t *input, int str_idx)
3668 const re_token_t *node = dfa->nodes + node_idx;
3669 int char_len, elem_len;
3672 if (BE (node->type == OP_UTF8_PERIOD, 0))
3674 unsigned char c = re_string_byte_at (input, str_idx), d;
3675 if (BE (c < 0xc2, 1))
3678 if (str_idx + 2 > input->len)
3681 d = re_string_byte_at (input, str_idx + 1);
3683 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3687 if (c == 0xe0 && d < 0xa0)
3693 if (c == 0xf0 && d < 0x90)
3699 if (c == 0xf8 && d < 0x88)
3705 if (c == 0xfc && d < 0x84)
3711 if (str_idx + char_len > input->len)
3714 for (i = 1; i < char_len; ++i)
3716 d = re_string_byte_at (input, str_idx + i);
3717 if (d < 0x80 || d > 0xbf)
3723 char_len = re_string_char_size_at (input, str_idx);
3724 if (node->type == OP_PERIOD)
3728 /* FIXME: I don't think this if is needed, as both '\n'
3729 and '\0' are char_len == 1. */
3730 /* '.' accepts any one character except the following two cases. */
3731 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3732 re_string_byte_at (input, str_idx) == '\n') ||
3733 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3734 re_string_byte_at (input, str_idx) == '\0'))
3739 elem_len = re_string_elem_size_at (input, str_idx);
3740 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3743 if (node->type == COMPLEX_BRACKET)
3745 const re_charset_t *cset = node->opr.mbcset;
3747 const unsigned char *pin
3748 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3753 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3754 ? re_string_wchar_at (input, str_idx) : 0);
3756 /* match with multibyte character? */
3757 for (i = 0; i < cset->nmbchars; ++i)
3758 if (wc == cset->mbchars[i])
3760 match_len = char_len;
3761 goto check_node_accept_bytes_match;
3763 /* match with character_class? */
3764 for (i = 0; i < cset->nchar_classes; ++i)
3766 wctype_t wt = cset->char_classes[i];
3767 if (__iswctype (wc, wt))
3769 match_len = char_len;
3770 goto check_node_accept_bytes_match;
3775 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3778 unsigned int in_collseq = 0;
3779 const int32_t *table, *indirect;
3780 const unsigned char *weights, *extra;
3781 const char *collseqwc;
3783 /* This #include defines a local function! */
3784 # include <locale/weight.h>
3786 /* match with collating_symbol? */
3787 if (cset->ncoll_syms)
3788 extra = (const unsigned char *)
3789 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3790 for (i = 0; i < cset->ncoll_syms; ++i)
3792 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3793 /* Compare the length of input collating element and
3794 the length of current collating element. */
3795 if (*coll_sym != elem_len)
3797 /* Compare each bytes. */
3798 for (j = 0; j < *coll_sym; j++)
3799 if (pin[j] != coll_sym[1 + j])
3803 /* Match if every bytes is equal. */
3805 goto check_node_accept_bytes_match;
3811 if (elem_len <= char_len)
3813 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3814 in_collseq = __collseq_table_lookup (collseqwc, wc);
3817 in_collseq = find_collation_sequence_value (pin, elem_len);
3819 /* match with range expression? */
3820 for (i = 0; i < cset->nranges; ++i)
3821 if (cset->range_starts[i] <= in_collseq
3822 && in_collseq <= cset->range_ends[i])
3824 match_len = elem_len;
3825 goto check_node_accept_bytes_match;
3828 /* match with equivalence_class? */
3829 if (cset->nequiv_classes)
3831 const unsigned char *cp = pin;
3832 table = (const int32_t *)
3833 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3834 weights = (const unsigned char *)
3835 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3836 extra = (const unsigned char *)
3837 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3838 indirect = (const int32_t *)
3839 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3840 idx = findidx (&cp);
3842 for (i = 0; i < cset->nequiv_classes; ++i)
3844 int32_t equiv_class_idx = cset->equiv_classes[i];
3845 size_t weight_len = weights[idx];
3846 if (weight_len == weights[equiv_class_idx])
3849 while (cnt <= weight_len
3850 && (weights[equiv_class_idx + 1 + cnt]
3851 == weights[idx + 1 + cnt]))
3853 if (cnt > weight_len)
3855 match_len = elem_len;
3856 goto check_node_accept_bytes_match;
3865 /* match with range expression? */
3868 memset (cmp_buf, 0, sizeof(cmp_buf));
3870 for (i = 0; i < cset->nranges; ++i)
3872 cmp_buf[0] = cset->range_starts[i];
3873 cmp_buf[4] = cset->range_ends[i];
3874 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3875 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3877 match_len = char_len;
3878 goto check_node_accept_bytes_match;
3883 check_node_accept_bytes_match:
3884 if (!cset->non_match)
3888 return (elem_len > char_len) ? elem_len : char_len;
3896 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3898 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3903 /* No valid character. Match it as a single byte character. */
3904 const unsigned char *collseq = (const unsigned char *)
3905 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3906 return collseq[mbs[0]];
3913 const unsigned char *extra = (const unsigned char *)
3914 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3915 int32_t extrasize = (const unsigned char *)
3916 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3918 for (idx = 0; idx < extrasize;)
3920 int mbs_cnt, found = 0;
3921 int32_t elem_mbs_len;
3922 /* Skip the name of collating element name. */
3923 idx = idx + extra[idx] + 1;
3924 elem_mbs_len = extra[idx++];
3925 if (mbs_len == elem_mbs_len)
3927 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3928 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3930 if (mbs_cnt == elem_mbs_len)
3931 /* Found the entry. */
3934 /* Skip the byte sequence of the collating element. */
3935 idx += elem_mbs_len;
3936 /* Adjust for the alignment. */
3937 idx = (idx + 3) & ~3;
3938 /* Skip the collation sequence value. */
3939 idx += sizeof (uint32_t);
3940 /* Skip the wide char sequence of the collating element. */
3941 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3942 /* If we found the entry, return the sequence value. */
3944 return *(uint32_t *) (extra + idx);
3945 /* Skip the collation sequence value. */
3946 idx += sizeof (uint32_t);
3952 #endif /* RE_ENABLE_I18N */
3954 /* Check whether the node accepts the byte which is IDX-th
3955 byte of the INPUT. */
3959 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3963 ch = re_string_byte_at (&mctx->input, idx);
3967 if (node->opr.c != ch)
3971 case SIMPLE_BRACKET:
3972 if (!bitset_contain (node->opr.sbcset, ch))
3976 #ifdef RE_ENABLE_I18N
3977 case OP_UTF8_PERIOD:
3983 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3984 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3992 if (node->constraint)
3994 /* The node has constraints. Check whether the current context
3995 satisfies the constraints. */
3996 unsigned int context = re_string_context_at (&mctx->input, idx,
3998 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4005 /* Extend the buffers, if the buffers have run out. */
4007 static reg_errcode_t
4009 extend_buffers (re_match_context_t *mctx)
4012 re_string_t *pstr = &mctx->input;
4014 /* Double the lengthes of the buffers. */
4015 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4016 if (BE (ret != REG_NOERROR, 0))
4019 if (mctx->state_log != NULL)
4021 /* And double the length of state_log. */
4022 /* XXX We have no indication of the size of this buffer. If this
4023 allocation fail we have no indication that the state_log array
4024 does not have the right size. */
4025 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4026 pstr->bufs_len + 1);
4027 if (BE (new_array == NULL, 0))
4029 mctx->state_log = new_array;
4032 /* Then reconstruct the buffers. */
4035 #ifdef RE_ENABLE_I18N
4036 if (pstr->mb_cur_max > 1)
4038 ret = build_wcs_upper_buffer (pstr);
4039 if (BE (ret != REG_NOERROR, 0))
4043 #endif /* RE_ENABLE_I18N */
4044 build_upper_buffer (pstr);
4048 #ifdef RE_ENABLE_I18N
4049 if (pstr->mb_cur_max > 1)
4050 build_wcs_buffer (pstr);
4052 #endif /* RE_ENABLE_I18N */
4054 if (pstr->trans != NULL)
4055 re_string_translate_buffer (pstr);
4062 /* Functions for matching context. */
4064 /* Initialize MCTX. */
4066 static reg_errcode_t
4068 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4070 mctx->eflags = eflags;
4071 mctx->match_last = -1;
4074 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4075 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4076 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4079 /* Already zero-ed by the caller.
4081 mctx->bkref_ents = NULL;
4082 mctx->nbkref_ents = 0;
4083 mctx->nsub_tops = 0; */
4084 mctx->abkref_ents = n;
4085 mctx->max_mb_elem_len = 1;
4086 mctx->asub_tops = n;
4090 /* Clean the entries which depend on the current input in MCTX.
4091 This function must be invoked when the matcher changes the start index
4092 of the input, or changes the input string. */
4096 match_ctx_clean (re_match_context_t *mctx)
4099 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4102 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4103 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4105 re_sub_match_last_t *last = top->lasts[sl_idx];
4106 re_free (last->path.array);
4109 re_free (top->lasts);
4112 re_free (top->path->array);
4113 re_free (top->path);
4118 mctx->nsub_tops = 0;
4119 mctx->nbkref_ents = 0;
4122 /* Free all the memory associated with MCTX. */
4126 match_ctx_free (re_match_context_t *mctx)
4128 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4129 match_ctx_clean (mctx);
4130 re_free (mctx->sub_tops);
4131 re_free (mctx->bkref_ents);
4134 /* Add a new backreference entry to MCTX.
4135 Note that we assume that caller never call this function with duplicate
4136 entry, and call with STR_IDX which isn't smaller than any existing entry.
4139 static reg_errcode_t
4141 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4144 if (mctx->nbkref_ents >= mctx->abkref_ents)
4146 struct re_backref_cache_entry* new_entry;
4147 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4148 mctx->abkref_ents * 2);
4149 if (BE (new_entry == NULL, 0))
4151 re_free (mctx->bkref_ents);
4154 mctx->bkref_ents = new_entry;
4155 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4156 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4157 mctx->abkref_ents *= 2;
4159 if (mctx->nbkref_ents > 0
4160 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4161 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4163 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4164 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4165 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4166 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4168 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4169 If bit N is clear, means that this entry won't epsilon-transition to
4170 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4171 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4174 A backreference does not epsilon-transition unless it is empty, so set
4175 to all zeros if FROM != TO. */
4176 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4177 = (from == to ? ~0 : 0);
4179 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4180 if (mctx->max_mb_elem_len < to - from)
4181 mctx->max_mb_elem_len = to - from;
4185 /* Search for the first entry which has the same str_idx, or -1 if none is
4186 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4190 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4192 int left, right, mid, last;
4193 last = right = mctx->nbkref_ents;
4194 for (left = 0; left < right;)
4196 mid = (left + right) / 2;
4197 if (mctx->bkref_ents[mid].str_idx < str_idx)
4202 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4208 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4211 static reg_errcode_t
4213 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4216 assert (mctx->sub_tops != NULL);
4217 assert (mctx->asub_tops > 0);
4219 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4221 int new_asub_tops = mctx->asub_tops * 2;
4222 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4223 re_sub_match_top_t *,
4225 if (BE (new_array == NULL, 0))
4227 mctx->sub_tops = new_array;
4228 mctx->asub_tops = new_asub_tops;
4230 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4231 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4233 mctx->sub_tops[mctx->nsub_tops]->node = node;
4234 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4238 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4239 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4241 static re_sub_match_last_t *
4243 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4245 re_sub_match_last_t *new_entry;
4246 if (BE (subtop->nlasts == subtop->alasts, 0))
4248 int new_alasts = 2 * subtop->alasts + 1;
4249 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4250 re_sub_match_last_t *,
4252 if (BE (new_array == NULL, 0))
4254 subtop->lasts = new_array;
4255 subtop->alasts = new_alasts;
4257 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4258 if (BE (new_entry != NULL, 1))
4260 subtop->lasts[subtop->nlasts] = new_entry;
4261 new_entry->node = node;
4262 new_entry->str_idx = str_idx;
4270 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4271 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4273 sctx->sifted_states = sifted_sts;
4274 sctx->limited_states = limited_sts;
4275 sctx->last_node = last_node;
4276 sctx->last_str_idx = last_str_idx;
4277 re_node_set_init_empty (&sctx->limits);