]> rtime.felk.cvut.cz Git - hornmich/skoda-qr-demo.git/blob - QRScanner/mobile/jni/thirdparty/mujs/regex.c
Add MuPDF native source codes
[hornmich/skoda-qr-demo.git] / QRScanner / mobile / jni / thirdparty / mujs / regex.c
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <setjmp.h>
5 #include <limits.h>
6
7 #include "regex.h"
8 #include "utf.h"
9
10 #define emit regemit
11 #define next regnext
12 #define accept regaccept
13
14 #define nelem(a) (sizeof (a) / sizeof (a)[0])
15
16 #define REPINF 255
17 #define MAXTHREAD 1000
18 #define MAXSUB REG_MAXSUB
19
20 typedef struct Reclass Reclass;
21 typedef struct Renode Renode;
22 typedef struct Reinst Reinst;
23 typedef struct Rethread Rethread;
24
25 struct Reclass {
26         Rune *end;
27         Rune spans[64];
28 };
29
30 struct Reprog {
31         Reinst *start, *end;
32         int flags;
33         unsigned int nsub;
34         Reclass cclass[16];
35 };
36
37 struct cstate {
38         Reprog *prog;
39         Renode *pstart, *pend;
40
41         const char *source;
42         unsigned int ncclass;
43         unsigned int nsub;
44         Renode *sub[MAXSUB];
45
46         int lookahead;
47         Rune yychar;
48         Reclass *yycc;
49         int yymin, yymax;
50
51         const char *error;
52         jmp_buf kaboom;
53 };
54
55 static void die(struct cstate *g, const char *message)
56 {
57         g->error = message;
58         longjmp(g->kaboom, 1);
59 }
60
61 static int canon(Rune c)
62 {
63         Rune u = toupperrune(c);
64         if (c >= 128 && u < 128)
65                 return c;
66         return u;
67 }
68
69 /* Scan */
70
71 enum {
72         L_CHAR = 256,
73         L_CCLASS,       /* character class */
74         L_NCCLASS,      /* negative character class */
75         L_NC,           /* "(?:" no capture */
76         L_PLA,          /* "(?=" positive lookahead */
77         L_NLA,          /* "(?!" negative lookahead */
78         L_WORD,         /* "\b" word boundary */
79         L_NWORD,        /* "\B" non-word boundary */
80         L_REF,          /* "\1" back-reference */
81         L_COUNT,        /* {M,N} */
82 };
83
84 static int hex(struct cstate *g, int c)
85 {
86         if (c >= '0' && c <= '9') return c - '0';
87         if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
88         if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
89         die(g, "invalid escape sequence");
90         return 0;
91 }
92
93 static int dec(struct cstate *g, int c)
94 {
95         if (c >= '0' && c <= '9') return c - '0';
96         die(g, "invalid quantifier");
97         return 0;
98 }
99
100 #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
101
102 static int isunicodeletter(int c)
103 {
104         return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
105 }
106
107 static int nextrune(struct cstate *g)
108 {
109         g->source += chartorune(&g->yychar, g->source);
110         if (g->yychar == '\\') {
111                 g->source += chartorune(&g->yychar, g->source);
112                 switch (g->yychar) {
113                 case 0: die(g, "unterminated escape sequence");
114                 case 'f': g->yychar = '\f'; return 0;
115                 case 'n': g->yychar = '\n'; return 0;
116                 case 'r': g->yychar = '\r'; return 0;
117                 case 't': g->yychar = '\t'; return 0;
118                 case 'v': g->yychar = '\v'; return 0;
119                 case 'c':
120                         g->yychar = (*g->source++) & 31;
121                         return 0;
122                 case 'x':
123                         g->yychar = hex(g, *g->source++) << 4;
124                         g->yychar += hex(g, *g->source++);
125                         if (g->yychar == 0) {
126                                 g->yychar = '0';
127                                 return 1;
128                         }
129                         return 0;
130                 case 'u':
131                         g->yychar = hex(g, *g->source++) << 12;
132                         g->yychar += hex(g, *g->source++) << 8;
133                         g->yychar += hex(g, *g->source++) << 4;
134                         g->yychar += hex(g, *g->source++);
135                         if (g->yychar == 0) {
136                                 g->yychar = '0';
137                                 return 1;
138                         }
139                         return 0;
140                 }
141                 if (strchr(ESCAPES, g->yychar))
142                         return 1;
143                 if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
144                         die(g, "invalid escape character");
145                 return 0;
146         }
147         return 0;
148 }
149
150 static int lexcount(struct cstate *g)
151 {
152         g->yychar = *g->source++;
153
154         g->yymin = dec(g, g->yychar);
155         g->yychar = *g->source++;
156         while (g->yychar != ',' && g->yychar != '}') {
157                 g->yymin = g->yymin * 10 + dec(g, g->yychar);
158                 g->yychar = *g->source++;
159         }
160         if (g->yymin >= REPINF)
161                 die(g, "numeric overflow");
162
163         if (g->yychar == ',') {
164                 g->yychar = *g->source++;
165                 if (g->yychar == '}') {
166                         g->yymax = REPINF;
167                 } else {
168                         g->yymax = dec(g, g->yychar);
169                         g->yychar = *g->source++;
170                         while (g->yychar != '}') {
171                                 g->yymax = g->yymax * 10 + dec(g, g->yychar);
172                                 g->yychar = *g->source++;
173                         }
174                         if (g->yymax >= REPINF)
175                                 die(g, "numeric overflow");
176                 }
177         } else {
178                 g->yymax = g->yymin;
179         }
180
181         return L_COUNT;
182 }
183
184 static void newcclass(struct cstate *g)
185 {
186         if (g->ncclass >= nelem(g->prog->cclass))
187                 die(g, "too many character classes");
188         g->yycc = g->prog->cclass + g->ncclass++;
189         g->yycc->end = g->yycc->spans;
190 }
191
192 static void addrange(struct cstate *g, Rune a, Rune b)
193 {
194         if (a > b)
195                 die(g, "invalid character class range");
196         if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
197                 die(g, "too many character class ranges");
198         *g->yycc->end++ = a;
199         *g->yycc->end++ = b;
200 }
201
202 static void addranges_d(struct cstate *g)
203 {
204         addrange(g, '0', '9');
205 }
206
207 static void addranges_D(struct cstate *g)
208 {
209         addrange(g, 0, '0'-1);
210         addrange(g, '9'+1, 0xFFFF);
211 }
212
213 static void addranges_s(struct cstate *g)
214 {
215         addrange(g, 0x9, 0x9);
216         addrange(g, 0xA, 0xD);
217         addrange(g, 0x20, 0x20);
218         addrange(g, 0xA0, 0xA0);
219         addrange(g, 0x2028, 0x2029);
220         addrange(g, 0xFEFF, 0xFEFF);
221 }
222
223 static void addranges_S(struct cstate *g)
224 {
225         addrange(g, 0, 0x9-1);
226         addrange(g, 0x9+1, 0xA-1);
227         addrange(g, 0xD+1, 0x20-1);
228         addrange(g, 0x20+1, 0xA0-1);
229         addrange(g, 0xA0+1, 0x2028-1);
230         addrange(g, 0x2029+1, 0xFEFF-1);
231         addrange(g, 0xFEFF+1, 0xFFFF);
232 }
233
234 static void addranges_w(struct cstate *g)
235 {
236         addrange(g, '0', '9');
237         addrange(g, 'A', 'Z');
238         addrange(g, '_', '_');
239         addrange(g, 'a', 'z');
240 }
241
242 static void addranges_W(struct cstate *g)
243 {
244         addrange(g, 0, '0'-1);
245         addrange(g, '9'+1, 'A'-1);
246         addrange(g, 'Z'+1, '_'-1);
247         addrange(g, '_'+1, 'a'-1);
248         addrange(g, 'z'+1, 0xFFFF);
249 }
250
251 static int lexclass(struct cstate *g)
252 {
253         int type = L_CCLASS;
254         int quoted, havesave, havedash;
255         Rune save;
256
257         newcclass(g);
258
259         quoted = nextrune(g);
260         if (!quoted && g->yychar == '^') {
261                 type = L_NCCLASS;
262                 quoted = nextrune(g);
263         }
264
265         havesave = havedash = 0;
266         for (;;) {
267                 if (g->yychar == 0)
268                         die(g, "unterminated character class");
269                 if (!quoted && g->yychar == ']')
270                         break;
271
272                 if (!quoted && g->yychar == '-') {
273                         if (havesave) {
274                                 if (havedash) {
275                                         addrange(g, save, '-');
276                                         havesave = havedash = 0;
277                                 } else {
278                                         havedash = 1;
279                                 }
280                         } else {
281                                 save = '-';
282                                 havesave = 1;
283                         }
284                 } else if (quoted && strchr("DSWdsw", g->yychar)) {
285                         if (havesave) {
286                                 addrange(g, save, save);
287                                 if (havedash)
288                                         addrange(g, '-', '-');
289                         }
290                         switch (g->yychar) {
291                         case 'd': addranges_d(g); break;
292                         case 's': addranges_s(g); break;
293                         case 'w': addranges_w(g); break;
294                         case 'D': addranges_D(g); break;
295                         case 'S': addranges_S(g); break;
296                         case 'W': addranges_W(g); break;
297                         }
298                         havesave = havedash = 0;
299                 } else {
300                         if (quoted) {
301                                 if (g->yychar == 'b')
302                                         g->yychar = '\b';
303                                 else if (g->yychar == '0')
304                                         g->yychar = 0;
305                                 /* else identity escape */
306                         }
307                         if (havesave) {
308                                 if (havedash) {
309                                         addrange(g, save, g->yychar);
310                                         havesave = havedash = 0;
311                                 } else {
312                                         addrange(g, save, save);
313                                         save = g->yychar;
314                                 }
315                         } else {
316                                 save = g->yychar;
317                                 havesave = 1;
318                         }
319                 }
320
321                 quoted = nextrune(g);
322         }
323
324         if (havesave) {
325                 addrange(g, save, save);
326                 if (havedash)
327                         addrange(g, '-', '-');
328         }
329
330         return type;
331 }
332
333 static int lex(struct cstate *g)
334 {
335         int quoted = nextrune(g);
336         if (quoted) {
337                 switch (g->yychar) {
338                 case 'b': return L_WORD;
339                 case 'B': return L_NWORD;
340                 case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
341                 case 's': newcclass(g); addranges_s(g); return L_CCLASS;
342                 case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
343                 case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
344                 case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
345                 case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
346                 case '0': g->yychar = 0; return L_CHAR;
347                 }
348                 if (g->yychar >= '0' && g->yychar <= '9') {
349                         g->yychar -= '0';
350                         if (*g->source >= '0' && *g->source <= '9')
351                                 g->yychar = g->yychar * 10 + *g->source++ - '0';
352                         return L_REF;
353                 }
354                 return L_CHAR;
355         }
356
357         switch (g->yychar) {
358         case 0:
359         case '$': case ')': case '*': case '+':
360         case '.': case '?': case '^': case '|':
361                 return g->yychar;
362         }
363
364         if (g->yychar == '{')
365                 return lexcount(g);
366         if (g->yychar == '[')
367                 return lexclass(g);
368         if (g->yychar == '(') {
369                 if (g->source[0] == '?') {
370                         if (g->source[1] == ':') {
371                                 g->source += 2;
372                                 return L_NC;
373                         }
374                         if (g->source[1] == '=') {
375                                 g->source += 2;
376                                 return L_PLA;
377                         }
378                         if (g->source[1] == '!') {
379                                 g->source += 2;
380                                 return L_NLA;
381                         }
382                 }
383                 return '(';
384         }
385
386         return L_CHAR;
387 }
388
389 /* Parse */
390
391 enum {
392         P_CAT, P_ALT, P_REP,
393         P_BOL, P_EOL, P_WORD, P_NWORD,
394         P_PAR, P_PLA, P_NLA,
395         P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
396         P_REF,
397 };
398
399 struct Renode {
400         unsigned char type;
401         unsigned char ng, m, n;
402         Rune c;
403         Reclass *cc;
404         Renode *x;
405         Renode *y;
406 };
407
408 static Renode *newnode(struct cstate *g, int type)
409 {
410         Renode *node = g->pend++;
411         node->type = type;
412         node->cc = NULL;
413         node->c = 0;
414         node->ng = 0;
415         node->m = 0;
416         node->n = 0;
417         node->x = node->y = NULL;
418         return node;
419 }
420
421 static int empty(Renode *node)
422 {
423         if (!node) return 1;
424         switch (node->type) {
425         default: return 1;
426         case P_CAT: return empty(node->x) && empty(node->y);
427         case P_ALT: return empty(node->x) || empty(node->y);
428         case P_REP: return empty(node->x) || node->m == 0;
429         case P_PAR: return empty(node->x);
430         case P_REF: return empty(node->x);
431         case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
432         }
433 }
434
435 static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
436 {
437         Renode *rep = newnode(g, P_REP);
438         if (max == REPINF && empty(atom))
439                 die(g, "infinite loop matching the empty string");
440         rep->ng = ng;
441         rep->m = min;
442         rep->n = max;
443         rep->x = atom;
444         return rep;
445 }
446
447 static void next(struct cstate *g)
448 {
449         g->lookahead = lex(g);
450 }
451
452 static int accept(struct cstate *g, int t)
453 {
454         if (g->lookahead == t) {
455                 next(g);
456                 return 1;
457         }
458         return 0;
459 }
460
461 static Renode *parsealt(struct cstate *g);
462
463 static Renode *parseatom(struct cstate *g)
464 {
465         Renode *atom;
466         if (g->lookahead == L_CHAR) {
467                 atom = newnode(g, P_CHAR);
468                 atom->c = g->yychar;
469                 next(g);
470                 return atom;
471         }
472         if (g->lookahead == L_CCLASS) {
473                 atom = newnode(g, P_CCLASS);
474                 atom->cc = g->yycc;
475                 next(g);
476                 return atom;
477         }
478         if (g->lookahead == L_NCCLASS) {
479                 atom = newnode(g, P_NCCLASS);
480                 atom->cc = g->yycc;
481                 next(g);
482                 return atom;
483         }
484         if (g->lookahead == L_REF) {
485                 atom = newnode(g, P_REF);
486                 if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
487                         die(g, "invalid back-reference");
488                 atom->n = g->yychar;
489                 atom->x = g->sub[g->yychar];
490                 next(g);
491                 return atom;
492         }
493         if (accept(g, '.'))
494                 return newnode(g, P_ANY);
495         if (accept(g, '(')) {
496                 atom = newnode(g, P_PAR);
497                 if (g->nsub == MAXSUB)
498                         die(g, "too many captures");
499                 atom->n = g->nsub++;
500                 atom->x = parsealt(g);
501                 g->sub[atom->n] = atom;
502                 if (!accept(g, ')'))
503                         die(g, "unmatched '('");
504                 return atom;
505         }
506         if (accept(g, L_NC)) {
507                 atom = parsealt(g);
508                 if (!accept(g, ')'))
509                         die(g, "unmatched '('");
510                 return atom;
511         }
512         if (accept(g, L_PLA)) {
513                 atom = newnode(g, P_PLA);
514                 atom->x = parsealt(g);
515                 if (!accept(g, ')'))
516                         die(g, "unmatched '('");
517                 return atom;
518         }
519         if (accept(g, L_NLA)) {
520                 atom = newnode(g, P_NLA);
521                 atom->x = parsealt(g);
522                 if (!accept(g, ')'))
523                         die(g, "unmatched '('");
524                 return atom;
525         }
526         die(g, "syntax error");
527         return NULL;
528 }
529
530 static Renode *parserep(struct cstate *g)
531 {
532         Renode *atom;
533
534         if (accept(g, '^')) return newnode(g, P_BOL);
535         if (accept(g, '$')) return newnode(g, P_EOL);
536         if (accept(g, L_WORD)) return newnode(g, P_WORD);
537         if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
538
539         atom = parseatom(g);
540         if (g->lookahead == L_COUNT) {
541                 int min = g->yymin, max = g->yymax;
542                 next(g);
543                 if (max < min)
544                         die(g, "invalid quantifier");
545                 return newrep(g, atom, accept(g, '?'), min, max);
546         }
547         if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
548         if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
549         if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
550         return atom;
551 }
552
553 static Renode *parsecat(struct cstate *g)
554 {
555         Renode *cat, *x;
556         if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
557                 cat = parserep(g);
558                 while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
559                         x = cat;
560                         cat = newnode(g, P_CAT);
561                         cat->x = x;
562                         cat->y = parserep(g);
563                 }
564                 return cat;
565         }
566         return NULL;
567 }
568
569 static Renode *parsealt(struct cstate *g)
570 {
571         Renode *alt, *x;
572         alt = parsecat(g);
573         while (accept(g, '|')) {
574                 x = alt;
575                 alt = newnode(g, P_ALT);
576                 alt->x = x;
577                 alt->y = parsecat(g);
578         }
579         return alt;
580 }
581
582 /* Compile */
583
584 enum {
585         I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
586         I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
587         I_BOL, I_EOL, I_WORD, I_NWORD,
588         I_LPAR, I_RPAR
589 };
590
591 struct Reinst {
592         unsigned char opcode;
593         unsigned char n;
594         Rune c;
595         Reclass *cc;
596         Reinst *x;
597         Reinst *y;
598 };
599
600 static unsigned int count(Renode *node)
601 {
602         unsigned int min, max;
603         if (!node) return 0;
604         switch (node->type) {
605         default: return 1;
606         case P_CAT: return count(node->x) + count(node->y);
607         case P_ALT: return count(node->x) + count(node->y) + 2;
608         case P_REP:
609                 min = node->m;
610                 max = node->n;
611                 if (min == max) return count(node->x) * min;
612                 if (max < REPINF) return count(node->x) * max + (max - min);
613                 return count(node->x) * (min + 1) + 2;
614         case P_PAR: return count(node->x) + 2;
615         case P_PLA: return count(node->x) + 2;
616         case P_NLA: return count(node->x) + 2;
617         }
618 }
619
620 static Reinst *emit(Reprog *prog, int opcode)
621 {
622         Reinst *inst = prog->end++;
623         inst->opcode = opcode;
624         inst->n = 0;
625         inst->c = 0;
626         inst->cc = NULL;
627         inst->x = inst->y = NULL;
628         return inst;
629 }
630
631 static void compile(Reprog *prog, Renode *node)
632 {
633         Reinst *inst, *split, *jump;
634         unsigned int i;
635
636         if (!node)
637                 return;
638
639         switch (node->type) {
640         case P_CAT:
641                 compile(prog, node->x);
642                 compile(prog, node->y);
643                 break;
644
645         case P_ALT:
646                 split = emit(prog, I_SPLIT);
647                 compile(prog, node->x);
648                 jump = emit(prog, I_JUMP);
649                 compile(prog, node->y);
650                 split->x = split + 1;
651                 split->y = jump + 1;
652                 jump->x = prog->end;
653                 break;
654
655         case P_REP:
656                 for (i = 0; i < node->m; ++i) {
657                         inst = prog->end;
658                         compile(prog, node->x);
659                 }
660                 if (node->m == node->n)
661                         break;
662                 if (node->n < REPINF) {
663                         for (i = node->m; i < node->n; ++i) {
664                                 split = emit(prog, I_SPLIT);
665                                 compile(prog, node->x);
666                                 if (node->ng) {
667                                         split->y = split + 1;
668                                         split->x = prog->end;
669                                 } else {
670                                         split->x = split + 1;
671                                         split->y = prog->end;
672                                 }
673                         }
674                 } else if (node->m == 0) {
675                         split = emit(prog, I_SPLIT);
676                         compile(prog, node->x);
677                         jump = emit(prog, I_JUMP);
678                         if (node->ng) {
679                                 split->y = split + 1;
680                                 split->x = prog->end;
681                         } else {
682                                 split->x = split + 1;
683                                 split->y = prog->end;
684                         }
685                         jump->x = split;
686                 } else {
687                         split = emit(prog, I_SPLIT);
688                         if (node->ng) {
689                                 split->y = inst;
690                                 split->x = prog->end;
691                         } else {
692                                 split->x = inst;
693                                 split->y = prog->end;
694                         }
695                 }
696                 break;
697
698         case P_BOL: emit(prog, I_BOL); break;
699         case P_EOL: emit(prog, I_EOL); break;
700         case P_WORD: emit(prog, I_WORD); break;
701         case P_NWORD: emit(prog, I_NWORD); break;
702
703         case P_PAR:
704                 inst = emit(prog, I_LPAR);
705                 inst->n = node->n;
706                 compile(prog, node->x);
707                 inst = emit(prog, I_RPAR);
708                 inst->n = node->n;
709                 break;
710         case P_PLA:
711                 split = emit(prog, I_PLA);
712                 compile(prog, node->x);
713                 emit(prog, I_END);
714                 split->x = split + 1;
715                 split->y = prog->end;
716                 break;
717         case P_NLA:
718                 split = emit(prog, I_NLA);
719                 compile(prog, node->x);
720                 emit(prog, I_END);
721                 split->x = split + 1;
722                 split->y = prog->end;
723                 break;
724
725         case P_ANY:
726                 emit(prog, I_ANY);
727                 break;
728         case P_CHAR:
729                 inst = emit(prog, I_CHAR);
730                 inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
731                 break;
732         case P_CCLASS:
733                 inst = emit(prog, I_CCLASS);
734                 inst->cc = node->cc;
735                 break;
736         case P_NCCLASS:
737                 inst = emit(prog, I_NCCLASS);
738                 inst->cc = node->cc;
739                 break;
740         case P_REF:
741                 inst = emit(prog, I_REF);
742                 inst->n = node->n;
743                 break;
744         }
745 }
746
747 #ifdef TEST
748 static void dumpnode(Renode *node)
749 {
750         Rune *p;
751         if (!node) { printf("Empty"); return; }
752         switch (node->type) {
753         case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
754         case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
755         case P_REP:
756                 printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
757                 dumpnode(node->x);
758                 printf(")");
759                 break;
760         case P_BOL: printf("Bol"); break;
761         case P_EOL: printf("Eol"); break;
762         case P_WORD: printf("Word"); break;
763         case P_NWORD: printf("NotWord"); break;
764         case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
765         case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
766         case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
767         case P_ANY: printf("Any"); break;
768         case P_CHAR: printf("Char(%c)", node->c); break;
769         case P_CCLASS:
770                 printf("Class(");
771                 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
772                 printf(")");
773                 break;
774         case P_NCCLASS:
775                 printf("NotClass(");
776                 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
777                 printf(")");
778                 break;
779         case P_REF: printf("Ref(%d)", node->n); break;
780         }
781 }
782
783 static void dumpprog(Reprog *prog)
784 {
785         Reinst *inst;
786         int i;
787         for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
788                 printf("% 5d: ", i);
789                 switch (inst->opcode) {
790                 case I_END: puts("end"); break;
791                 case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
792                 case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
793                 case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
794                 case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
795                 case I_ANY: puts("any"); break;
796                 case I_ANYNL: puts("anynl"); break;
797                 case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
798                 case I_CCLASS: puts("cclass"); break;
799                 case I_NCCLASS: puts("ncclass"); break;
800                 case I_REF: printf("ref %d\n", inst->n); break;
801                 case I_BOL: puts("bol"); break;
802                 case I_EOL: puts("eol"); break;
803                 case I_WORD: puts("word"); break;
804                 case I_NWORD: puts("nword"); break;
805                 case I_LPAR: printf("lpar %d\n", inst->n); break;
806                 case I_RPAR: printf("rpar %d\n", inst->n); break;
807                 }
808         }
809 }
810 #endif
811
812 Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
813 {
814         struct cstate g;
815         Renode *node;
816         Reinst *split, *jump;
817         int i;
818
819         g.prog = malloc(sizeof (Reprog));
820         g.pstart = g.pend = malloc(sizeof (Renode) * strlen(pattern) * 2);
821
822         if (setjmp(g.kaboom)) {
823                 if (errorp) *errorp = g.error;
824                 free(g.pstart);
825                 free(g.prog);
826                 return NULL;
827         }
828
829         g.source = pattern;
830         g.ncclass = 0;
831         g.nsub = 1;
832         for (i = 0; i < MAXSUB; ++i)
833                 g.sub[i] = 0;
834
835         g.prog->flags = cflags;
836
837         next(&g);
838         node = parsealt(&g);
839         if (g.lookahead == ')')
840                 die(&g, "unmatched ')'");
841         if (g.lookahead != 0)
842                 die(&g, "syntax error");
843
844         g.prog->nsub = g.nsub;
845         g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
846
847         split = emit(g.prog, I_SPLIT);
848         split->x = split + 3;
849         split->y = split + 1;
850         emit(g.prog, I_ANYNL);
851         jump = emit(g.prog, I_JUMP);
852         jump->x = split;
853         emit(g.prog, I_LPAR);
854         compile(g.prog, node);
855         emit(g.prog, I_RPAR);
856         emit(g.prog, I_END);
857
858 #ifdef TEST
859         dumpnode(node);
860         putchar('\n');
861         dumpprog(g.prog);
862 #endif
863
864         free(g.pstart);
865
866         if (errorp) *errorp = NULL;
867         return g.prog;
868 }
869
870 void regfree(Reprog *prog)
871 {
872         if (prog) {
873                 free(prog->start);
874                 free(prog);
875         }
876 }
877
878 /* Match */
879
880 static int isnewline(int c)
881 {
882         return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
883 }
884
885 static int iswordchar(int c)
886 {
887         return c == '_' ||
888                 (c >= 'a' && c <= 'z') ||
889                 (c >= 'A' && c <= 'Z') ||
890                 (c >= '0' && c <= '9');
891 }
892
893 static int incclass(Reclass *cc, Rune c)
894 {
895         Rune *p;
896         for (p = cc->spans; p < cc->end; p += 2)
897                 if (p[0] <= c && c <= p[1])
898                         return 1;
899         return 0;
900 }
901
902 static int incclasscanon(Reclass *cc, Rune c)
903 {
904         Rune *p, r;
905         for (p = cc->spans; p < cc->end; p += 2)
906                 for (r = p[0]; r <= p[1]; ++r)
907                         if (c == canon(r))
908                                 return 1;
909         return 0;
910 }
911
912 static int strncmpcanon(const char *a, const char *b, unsigned int n)
913 {
914         Rune ra, rb;
915         int c;
916         while (n--) {
917                 if (!*a) return -1;
918                 if (!*b) return 1;
919                 a += chartorune(&ra, a);
920                 b += chartorune(&rb, b);
921                 c = canon(ra) - canon(rb);
922                 if (c)
923                         return c;
924         }
925         return 0;
926 }
927
928 struct Rethread {
929         Reinst *pc;
930         const char *sp;
931         Resub sub;
932 };
933
934 static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
935 {
936         t->pc = pc;
937         t->sp = sp;
938         memcpy(&t->sub, sub, sizeof t->sub);
939 }
940
941 static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
942 {
943         Rethread ready[MAXTHREAD];
944         Resub scratch;
945         Resub sub;
946         Rune c;
947         unsigned int nready;
948         int i;
949
950         /* queue initial thread */
951         spawn(ready + 0, pc, sp, out);
952         nready = 1;
953
954         /* run threads in stack order */
955         while (nready > 0) {
956                 --nready;
957                 pc = ready[nready].pc;
958                 sp = ready[nready].sp;
959                 memcpy(&sub, &ready[nready].sub, sizeof sub);
960                 for (;;) {
961                         switch (pc->opcode) {
962                         case I_END:
963                                 for (i = 0; i < MAXSUB; ++i) {
964                                         out->sub[i].sp = sub.sub[i].sp;
965                                         out->sub[i].ep = sub.sub[i].ep;
966                                 }
967                                 return 1;
968                         case I_JUMP:
969                                 pc = pc->x;
970                                 continue;
971                         case I_SPLIT:
972                                 if (nready >= MAXTHREAD) {
973                                         fprintf(stderr, "regexec: backtrack overflow!\n");
974                                         return 0;
975                                 }
976                                 spawn(&ready[nready++], pc->y, sp, &sub);
977                                 pc = pc->x;
978                                 continue;
979
980                         case I_PLA:
981                                 if (!match(pc->x, sp, bol, flags, &sub))
982                                         goto dead;
983                                 pc = pc->y;
984                                 continue;
985                         case I_NLA:
986                                 memcpy(&scratch, &sub, sizeof scratch);
987                                 if (match(pc->x, sp, bol, flags, &scratch))
988                                         goto dead;
989                                 pc = pc->y;
990                                 continue;
991
992                         case I_ANYNL:
993                                 sp += chartorune(&c, sp);
994                                 if (c == 0)
995                                         goto dead;
996                                 break;
997                         case I_ANY:
998                                 sp += chartorune(&c, sp);
999                                 if (c == 0)
1000                                         goto dead;
1001                                 if (isnewline(c))
1002                                         goto dead;
1003                                 break;
1004                         case I_CHAR:
1005                                 sp += chartorune(&c, sp);
1006                                 if (c == 0)
1007                                         goto dead;
1008                                 if (flags & REG_ICASE)
1009                                         c = canon(c);
1010                                 if (c != pc->c)
1011                                         goto dead;
1012                                 break;
1013                         case I_CCLASS:
1014                                 sp += chartorune(&c, sp);
1015                                 if (c == 0)
1016                                         goto dead;
1017                                 if (flags & REG_ICASE) {
1018                                         if (!incclasscanon(pc->cc, canon(c)))
1019                                                 goto dead;
1020                                 } else {
1021                                         if (!incclass(pc->cc, c))
1022                                                 goto dead;
1023                                 }
1024                                 break;
1025                         case I_NCCLASS:
1026                                 sp += chartorune(&c, sp);
1027                                 if (c == 0)
1028                                         goto dead;
1029                                 if (flags & REG_ICASE) {
1030                                         if (incclasscanon(pc->cc, canon(c)))
1031                                                 goto dead;
1032                                 } else {
1033                                         if (incclass(pc->cc, c))
1034                                                 goto dead;
1035                                 }
1036                                 break;
1037                         case I_REF:
1038                                 i = sub.sub[pc->n].ep - sub.sub[pc->n].sp;
1039                                 if (flags & REG_ICASE) {
1040                                         if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
1041                                                 goto dead;
1042                                 } else {
1043                                         if (strncmp(sp, sub.sub[pc->n].sp, i))
1044                                                 goto dead;
1045                                 }
1046                                 if (i > 0)
1047                                         sp += i;
1048                                 break;
1049
1050                         case I_BOL:
1051                                 if (sp == bol && !(flags & REG_NOTBOL))
1052                                         break;
1053                                 if (flags & REG_NEWLINE)
1054                                         if (sp > bol && isnewline(sp[-1]))
1055                                                 break;
1056                                 goto dead;
1057                         case I_EOL:
1058                                 if (*sp == 0)
1059                                         break;
1060                                 if (flags & REG_NEWLINE)
1061                                         if (isnewline(*sp))
1062                                                 break;
1063                                 goto dead;
1064                         case I_WORD:
1065                                 i = sp > bol && iswordchar(sp[-1]);
1066                                 i ^= iswordchar(sp[0]);
1067                                 if (i)
1068                                         break;
1069                                 goto dead;
1070                         case I_NWORD:
1071                                 i = sp > bol && iswordchar(sp[-1]);
1072                                 i ^= iswordchar(sp[0]);
1073                                 if (!i)
1074                                         break;
1075                                 goto dead;
1076
1077                         case I_LPAR:
1078                                 sub.sub[pc->n].sp = sp;
1079                                 break;
1080                         case I_RPAR:
1081                                 sub.sub[pc->n].ep = sp;
1082                                 break;
1083                         default:
1084                                 goto dead;
1085                         }
1086                         pc = pc + 1;
1087                 }
1088 dead: ;
1089         }
1090         return 0;
1091 }
1092
1093 int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1094 {
1095         Resub scratch;
1096         int i;
1097
1098         if (!sub)
1099                 sub = &scratch;
1100
1101         sub->nsub = prog->nsub;
1102         for (i = 0; i < MAXSUB; ++i)
1103                 sub->sub[i].sp = sub->sub[i].ep = NULL;
1104
1105         return !match(prog->start, sp, sp, prog->flags | eflags, sub);
1106 }
1107
1108 #ifdef TEST
1109 int main(int argc, char **argv)
1110 {
1111         const char *error;
1112         const char *s;
1113         Reprog *p;
1114         Resub m;
1115         unsigned int i;
1116
1117         if (argc > 1) {
1118                 p = regcomp(argv[1], 0, &error);
1119                 if (!p) {
1120                         fprintf(stderr, "regcomp: %s\n", error);
1121                         return 1;
1122                 }
1123
1124                 if (argc > 2) {
1125                         s = argv[2];
1126                         printf("nsub = %d\n", p->nsub);
1127                         if (!regexec(p, s, &m, 0)) {
1128                                 for (i = 0; i < m.nsub; ++i) {
1129                                         int n = m.sub[i].ep - m.sub[i].sp;
1130                                         if (n > 0)
1131                                                 printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1132                                         else
1133                                                 printf("match %d: n=0 ''\n", i);
1134                                 }
1135                         } else {
1136                                 printf("no match\n");
1137                         }
1138                 }
1139         }
1140
1141         return 0;
1142 }
1143 #endif