12 #define accept regaccept
14 #define nelem(a) (sizeof (a) / sizeof (a)[0])
17 #define MAXTHREAD 1000
18 #define MAXSUB REG_MAXSUB
20 typedef struct Reclass Reclass;
21 typedef struct Renode Renode;
22 typedef struct Reinst Reinst;
23 typedef struct Rethread Rethread;
39 Renode *pstart, *pend;
55 static void die(struct cstate *g, const char *message)
58 longjmp(g->kaboom, 1);
61 static int canon(Rune c)
63 Rune u = toupperrune(c);
64 if (c >= 128 && u < 128)
73 L_CCLASS, /* character class */
74 L_NCCLASS, /* negative character class */
75 L_NC, /* "(?:" no capture */
76 L_PLA, /* "(?=" positive lookahead */
77 L_NLA, /* "(?!" negative lookahead */
78 L_WORD, /* "\b" word boundary */
79 L_NWORD, /* "\B" non-word boundary */
80 L_REF, /* "\1" back-reference */
84 static int hex(struct cstate *g, int c)
86 if (c >= '0' && c <= '9') return c - '0';
87 if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
88 if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
89 die(g, "invalid escape sequence");
93 static int dec(struct cstate *g, int c)
95 if (c >= '0' && c <= '9') return c - '0';
96 die(g, "invalid quantifier");
100 #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
102 static int isunicodeletter(int c)
104 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
107 static int nextrune(struct cstate *g)
109 g->source += chartorune(&g->yychar, g->source);
110 if (g->yychar == '\\') {
111 g->source += chartorune(&g->yychar, g->source);
113 case 0: die(g, "unterminated escape sequence");
114 case 'f': g->yychar = '\f'; return 0;
115 case 'n': g->yychar = '\n'; return 0;
116 case 'r': g->yychar = '\r'; return 0;
117 case 't': g->yychar = '\t'; return 0;
118 case 'v': g->yychar = '\v'; return 0;
120 g->yychar = (*g->source++) & 31;
123 g->yychar = hex(g, *g->source++) << 4;
124 g->yychar += hex(g, *g->source++);
125 if (g->yychar == 0) {
131 g->yychar = hex(g, *g->source++) << 12;
132 g->yychar += hex(g, *g->source++) << 8;
133 g->yychar += hex(g, *g->source++) << 4;
134 g->yychar += hex(g, *g->source++);
135 if (g->yychar == 0) {
141 if (strchr(ESCAPES, g->yychar))
143 if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
144 die(g, "invalid escape character");
150 static int lexcount(struct cstate *g)
152 g->yychar = *g->source++;
154 g->yymin = dec(g, g->yychar);
155 g->yychar = *g->source++;
156 while (g->yychar != ',' && g->yychar != '}') {
157 g->yymin = g->yymin * 10 + dec(g, g->yychar);
158 g->yychar = *g->source++;
160 if (g->yymin >= REPINF)
161 die(g, "numeric overflow");
163 if (g->yychar == ',') {
164 g->yychar = *g->source++;
165 if (g->yychar == '}') {
168 g->yymax = dec(g, g->yychar);
169 g->yychar = *g->source++;
170 while (g->yychar != '}') {
171 g->yymax = g->yymax * 10 + dec(g, g->yychar);
172 g->yychar = *g->source++;
174 if (g->yymax >= REPINF)
175 die(g, "numeric overflow");
184 static void newcclass(struct cstate *g)
186 if (g->ncclass >= nelem(g->prog->cclass))
187 die(g, "too many character classes");
188 g->yycc = g->prog->cclass + g->ncclass++;
189 g->yycc->end = g->yycc->spans;
192 static void addrange(struct cstate *g, Rune a, Rune b)
195 die(g, "invalid character class range");
196 if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
197 die(g, "too many character class ranges");
202 static void addranges_d(struct cstate *g)
204 addrange(g, '0', '9');
207 static void addranges_D(struct cstate *g)
209 addrange(g, 0, '0'-1);
210 addrange(g, '9'+1, 0xFFFF);
213 static void addranges_s(struct cstate *g)
215 addrange(g, 0x9, 0x9);
216 addrange(g, 0xA, 0xD);
217 addrange(g, 0x20, 0x20);
218 addrange(g, 0xA0, 0xA0);
219 addrange(g, 0x2028, 0x2029);
220 addrange(g, 0xFEFF, 0xFEFF);
223 static void addranges_S(struct cstate *g)
225 addrange(g, 0, 0x9-1);
226 addrange(g, 0x9+1, 0xA-1);
227 addrange(g, 0xD+1, 0x20-1);
228 addrange(g, 0x20+1, 0xA0-1);
229 addrange(g, 0xA0+1, 0x2028-1);
230 addrange(g, 0x2029+1, 0xFEFF-1);
231 addrange(g, 0xFEFF+1, 0xFFFF);
234 static void addranges_w(struct cstate *g)
236 addrange(g, '0', '9');
237 addrange(g, 'A', 'Z');
238 addrange(g, '_', '_');
239 addrange(g, 'a', 'z');
242 static void addranges_W(struct cstate *g)
244 addrange(g, 0, '0'-1);
245 addrange(g, '9'+1, 'A'-1);
246 addrange(g, 'Z'+1, '_'-1);
247 addrange(g, '_'+1, 'a'-1);
248 addrange(g, 'z'+1, 0xFFFF);
251 static int lexclass(struct cstate *g)
254 int quoted, havesave, havedash;
259 quoted = nextrune(g);
260 if (!quoted && g->yychar == '^') {
262 quoted = nextrune(g);
265 havesave = havedash = 0;
268 die(g, "unterminated character class");
269 if (!quoted && g->yychar == ']')
272 if (!quoted && g->yychar == '-') {
275 addrange(g, save, '-');
276 havesave = havedash = 0;
284 } else if (quoted && strchr("DSWdsw", g->yychar)) {
286 addrange(g, save, save);
288 addrange(g, '-', '-');
291 case 'd': addranges_d(g); break;
292 case 's': addranges_s(g); break;
293 case 'w': addranges_w(g); break;
294 case 'D': addranges_D(g); break;
295 case 'S': addranges_S(g); break;
296 case 'W': addranges_W(g); break;
298 havesave = havedash = 0;
301 if (g->yychar == 'b')
303 else if (g->yychar == '0')
305 /* else identity escape */
309 addrange(g, save, g->yychar);
310 havesave = havedash = 0;
312 addrange(g, save, save);
321 quoted = nextrune(g);
325 addrange(g, save, save);
327 addrange(g, '-', '-');
333 static int lex(struct cstate *g)
335 int quoted = nextrune(g);
338 case 'b': return L_WORD;
339 case 'B': return L_NWORD;
340 case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
341 case 's': newcclass(g); addranges_s(g); return L_CCLASS;
342 case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
343 case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
344 case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
345 case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
346 case '0': g->yychar = 0; return L_CHAR;
348 if (g->yychar >= '0' && g->yychar <= '9') {
350 if (*g->source >= '0' && *g->source <= '9')
351 g->yychar = g->yychar * 10 + *g->source++ - '0';
359 case '$': case ')': case '*': case '+':
360 case '.': case '?': case '^': case '|':
364 if (g->yychar == '{')
366 if (g->yychar == '[')
368 if (g->yychar == '(') {
369 if (g->source[0] == '?') {
370 if (g->source[1] == ':') {
374 if (g->source[1] == '=') {
378 if (g->source[1] == '!') {
393 P_BOL, P_EOL, P_WORD, P_NWORD,
395 P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
401 unsigned char ng, m, n;
408 static Renode *newnode(struct cstate *g, int type)
410 Renode *node = g->pend++;
417 node->x = node->y = NULL;
421 static int empty(Renode *node)
424 switch (node->type) {
426 case P_CAT: return empty(node->x) && empty(node->y);
427 case P_ALT: return empty(node->x) || empty(node->y);
428 case P_REP: return empty(node->x) || node->m == 0;
429 case P_PAR: return empty(node->x);
430 case P_REF: return empty(node->x);
431 case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
435 static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
437 Renode *rep = newnode(g, P_REP);
438 if (max == REPINF && empty(atom))
439 die(g, "infinite loop matching the empty string");
447 static void next(struct cstate *g)
449 g->lookahead = lex(g);
452 static int accept(struct cstate *g, int t)
454 if (g->lookahead == t) {
461 static Renode *parsealt(struct cstate *g);
463 static Renode *parseatom(struct cstate *g)
466 if (g->lookahead == L_CHAR) {
467 atom = newnode(g, P_CHAR);
472 if (g->lookahead == L_CCLASS) {
473 atom = newnode(g, P_CCLASS);
478 if (g->lookahead == L_NCCLASS) {
479 atom = newnode(g, P_NCCLASS);
484 if (g->lookahead == L_REF) {
485 atom = newnode(g, P_REF);
486 if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
487 die(g, "invalid back-reference");
489 atom->x = g->sub[g->yychar];
494 return newnode(g, P_ANY);
495 if (accept(g, '(')) {
496 atom = newnode(g, P_PAR);
497 if (g->nsub == MAXSUB)
498 die(g, "too many captures");
500 atom->x = parsealt(g);
501 g->sub[atom->n] = atom;
503 die(g, "unmatched '('");
506 if (accept(g, L_NC)) {
509 die(g, "unmatched '('");
512 if (accept(g, L_PLA)) {
513 atom = newnode(g, P_PLA);
514 atom->x = parsealt(g);
516 die(g, "unmatched '('");
519 if (accept(g, L_NLA)) {
520 atom = newnode(g, P_NLA);
521 atom->x = parsealt(g);
523 die(g, "unmatched '('");
526 die(g, "syntax error");
530 static Renode *parserep(struct cstate *g)
534 if (accept(g, '^')) return newnode(g, P_BOL);
535 if (accept(g, '$')) return newnode(g, P_EOL);
536 if (accept(g, L_WORD)) return newnode(g, P_WORD);
537 if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
540 if (g->lookahead == L_COUNT) {
541 int min = g->yymin, max = g->yymax;
544 die(g, "invalid quantifier");
545 return newrep(g, atom, accept(g, '?'), min, max);
547 if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
548 if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
549 if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
553 static Renode *parsecat(struct cstate *g)
556 if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
558 while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
560 cat = newnode(g, P_CAT);
562 cat->y = parserep(g);
569 static Renode *parsealt(struct cstate *g)
573 while (accept(g, '|')) {
575 alt = newnode(g, P_ALT);
577 alt->y = parsecat(g);
585 I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
586 I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
587 I_BOL, I_EOL, I_WORD, I_NWORD,
592 unsigned char opcode;
600 static unsigned int count(Renode *node)
602 unsigned int min, max;
604 switch (node->type) {
606 case P_CAT: return count(node->x) + count(node->y);
607 case P_ALT: return count(node->x) + count(node->y) + 2;
611 if (min == max) return count(node->x) * min;
612 if (max < REPINF) return count(node->x) * max + (max - min);
613 return count(node->x) * (min + 1) + 2;
614 case P_PAR: return count(node->x) + 2;
615 case P_PLA: return count(node->x) + 2;
616 case P_NLA: return count(node->x) + 2;
620 static Reinst *emit(Reprog *prog, int opcode)
622 Reinst *inst = prog->end++;
623 inst->opcode = opcode;
627 inst->x = inst->y = NULL;
631 static void compile(Reprog *prog, Renode *node)
633 Reinst *inst, *split, *jump;
639 switch (node->type) {
641 compile(prog, node->x);
642 compile(prog, node->y);
646 split = emit(prog, I_SPLIT);
647 compile(prog, node->x);
648 jump = emit(prog, I_JUMP);
649 compile(prog, node->y);
650 split->x = split + 1;
656 for (i = 0; i < node->m; ++i) {
658 compile(prog, node->x);
660 if (node->m == node->n)
662 if (node->n < REPINF) {
663 for (i = node->m; i < node->n; ++i) {
664 split = emit(prog, I_SPLIT);
665 compile(prog, node->x);
667 split->y = split + 1;
668 split->x = prog->end;
670 split->x = split + 1;
671 split->y = prog->end;
674 } else if (node->m == 0) {
675 split = emit(prog, I_SPLIT);
676 compile(prog, node->x);
677 jump = emit(prog, I_JUMP);
679 split->y = split + 1;
680 split->x = prog->end;
682 split->x = split + 1;
683 split->y = prog->end;
687 split = emit(prog, I_SPLIT);
690 split->x = prog->end;
693 split->y = prog->end;
698 case P_BOL: emit(prog, I_BOL); break;
699 case P_EOL: emit(prog, I_EOL); break;
700 case P_WORD: emit(prog, I_WORD); break;
701 case P_NWORD: emit(prog, I_NWORD); break;
704 inst = emit(prog, I_LPAR);
706 compile(prog, node->x);
707 inst = emit(prog, I_RPAR);
711 split = emit(prog, I_PLA);
712 compile(prog, node->x);
714 split->x = split + 1;
715 split->y = prog->end;
718 split = emit(prog, I_NLA);
719 compile(prog, node->x);
721 split->x = split + 1;
722 split->y = prog->end;
729 inst = emit(prog, I_CHAR);
730 inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
733 inst = emit(prog, I_CCLASS);
737 inst = emit(prog, I_NCCLASS);
741 inst = emit(prog, I_REF);
748 static void dumpnode(Renode *node)
751 if (!node) { printf("Empty"); return; }
752 switch (node->type) {
753 case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
754 case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
756 printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
760 case P_BOL: printf("Bol"); break;
761 case P_EOL: printf("Eol"); break;
762 case P_WORD: printf("Word"); break;
763 case P_NWORD: printf("NotWord"); break;
764 case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
765 case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
766 case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
767 case P_ANY: printf("Any"); break;
768 case P_CHAR: printf("Char(%c)", node->c); break;
771 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
776 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
779 case P_REF: printf("Ref(%d)", node->n); break;
783 static void dumpprog(Reprog *prog)
787 for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
789 switch (inst->opcode) {
790 case I_END: puts("end"); break;
791 case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
792 case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
793 case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
794 case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
795 case I_ANY: puts("any"); break;
796 case I_ANYNL: puts("anynl"); break;
797 case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
798 case I_CCLASS: puts("cclass"); break;
799 case I_NCCLASS: puts("ncclass"); break;
800 case I_REF: printf("ref %d\n", inst->n); break;
801 case I_BOL: puts("bol"); break;
802 case I_EOL: puts("eol"); break;
803 case I_WORD: puts("word"); break;
804 case I_NWORD: puts("nword"); break;
805 case I_LPAR: printf("lpar %d\n", inst->n); break;
806 case I_RPAR: printf("rpar %d\n", inst->n); break;
812 Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
816 Reinst *split, *jump;
819 g.prog = malloc(sizeof (Reprog));
820 g.pstart = g.pend = malloc(sizeof (Renode) * strlen(pattern) * 2);
822 if (setjmp(g.kaboom)) {
823 if (errorp) *errorp = g.error;
832 for (i = 0; i < MAXSUB; ++i)
835 g.prog->flags = cflags;
839 if (g.lookahead == ')')
840 die(&g, "unmatched ')'");
841 if (g.lookahead != 0)
842 die(&g, "syntax error");
844 g.prog->nsub = g.nsub;
845 g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
847 split = emit(g.prog, I_SPLIT);
848 split->x = split + 3;
849 split->y = split + 1;
850 emit(g.prog, I_ANYNL);
851 jump = emit(g.prog, I_JUMP);
853 emit(g.prog, I_LPAR);
854 compile(g.prog, node);
855 emit(g.prog, I_RPAR);
866 if (errorp) *errorp = NULL;
870 void regfree(Reprog *prog)
880 static int isnewline(int c)
882 return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
885 static int iswordchar(int c)
888 (c >= 'a' && c <= 'z') ||
889 (c >= 'A' && c <= 'Z') ||
890 (c >= '0' && c <= '9');
893 static int incclass(Reclass *cc, Rune c)
896 for (p = cc->spans; p < cc->end; p += 2)
897 if (p[0] <= c && c <= p[1])
902 static int incclasscanon(Reclass *cc, Rune c)
905 for (p = cc->spans; p < cc->end; p += 2)
906 for (r = p[0]; r <= p[1]; ++r)
912 static int strncmpcanon(const char *a, const char *b, unsigned int n)
919 a += chartorune(&ra, a);
920 b += chartorune(&rb, b);
921 c = canon(ra) - canon(rb);
934 static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
938 memcpy(&t->sub, sub, sizeof t->sub);
941 static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
943 Rethread ready[MAXTHREAD];
950 /* queue initial thread */
951 spawn(ready + 0, pc, sp, out);
954 /* run threads in stack order */
957 pc = ready[nready].pc;
958 sp = ready[nready].sp;
959 memcpy(&sub, &ready[nready].sub, sizeof sub);
961 switch (pc->opcode) {
963 for (i = 0; i < MAXSUB; ++i) {
964 out->sub[i].sp = sub.sub[i].sp;
965 out->sub[i].ep = sub.sub[i].ep;
972 if (nready >= MAXTHREAD) {
973 fprintf(stderr, "regexec: backtrack overflow!\n");
976 spawn(&ready[nready++], pc->y, sp, &sub);
981 if (!match(pc->x, sp, bol, flags, &sub))
986 memcpy(&scratch, &sub, sizeof scratch);
987 if (match(pc->x, sp, bol, flags, &scratch))
993 sp += chartorune(&c, sp);
998 sp += chartorune(&c, sp);
1005 sp += chartorune(&c, sp);
1008 if (flags & REG_ICASE)
1014 sp += chartorune(&c, sp);
1017 if (flags & REG_ICASE) {
1018 if (!incclasscanon(pc->cc, canon(c)))
1021 if (!incclass(pc->cc, c))
1026 sp += chartorune(&c, sp);
1029 if (flags & REG_ICASE) {
1030 if (incclasscanon(pc->cc, canon(c)))
1033 if (incclass(pc->cc, c))
1038 i = sub.sub[pc->n].ep - sub.sub[pc->n].sp;
1039 if (flags & REG_ICASE) {
1040 if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
1043 if (strncmp(sp, sub.sub[pc->n].sp, i))
1051 if (sp == bol && !(flags & REG_NOTBOL))
1053 if (flags & REG_NEWLINE)
1054 if (sp > bol && isnewline(sp[-1]))
1060 if (flags & REG_NEWLINE)
1065 i = sp > bol && iswordchar(sp[-1]);
1066 i ^= iswordchar(sp[0]);
1071 i = sp > bol && iswordchar(sp[-1]);
1072 i ^= iswordchar(sp[0]);
1078 sub.sub[pc->n].sp = sp;
1081 sub.sub[pc->n].ep = sp;
1093 int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1101 sub->nsub = prog->nsub;
1102 for (i = 0; i < MAXSUB; ++i)
1103 sub->sub[i].sp = sub->sub[i].ep = NULL;
1105 return !match(prog->start, sp, sp, prog->flags | eflags, sub);
1109 int main(int argc, char **argv)
1118 p = regcomp(argv[1], 0, &error);
1120 fprintf(stderr, "regcomp: %s\n", error);
1126 printf("nsub = %d\n", p->nsub);
1127 if (!regexec(p, s, &m, 0)) {
1128 for (i = 0; i < m.nsub; ++i) {
1129 int n = m.sub[i].ep - m.sub[i].sp;
1131 printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1133 printf("match %d: n=0 ''\n", i);
1136 printf("no match\n");