1 /* $DragonFly: src/lib/libcompat/regexp/regexp.c,v 1.2 2004/10/25 19:38:02 drhodus Exp $ */
3 * regcomp and regexec -- regsub and regerror are elsewhere
5 * Copyright (c) 1986 by University of Toronto.
6 * Written by Henry Spencer. Not derived from licensed software.
8 * Permission is granted to anyone to use this software for any
9 * purpose on any computer system, and to redistribute it freely,
10 * subject to the following restrictions:
12 * 1. The author is not responsible for the consequences of use of
13 * this software, no matter how awful, even if they arise
16 * 2. The origin of this software must not be misrepresented, either
17 * by explicit claim or by omission.
19 * 3. Altered versions must be plainly marked as such, and must not
20 * be misrepresented as being the original software.
21 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
22 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
23 *** to assist in implementing egrep.
24 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
25 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
26 *** as in BSD grep and ex.
27 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
28 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
29 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
30 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 * Beware that some of this code is subtly aware of the way operator
33 * precedence is structured in regular expressions. Serious changes in
34 * regular-expression syntax might require a total rethink.
46 * The "internal use only" fields in regexp.h are present to pass info from
47 * compile to execute that permits the execute phase to run lots faster on
48 * simple cases. They are:
50 * regstart char that must begin a match; '\0' if none obvious
51 * reganch is the match anchored (at beginning-of-line only)?
52 * regmust string (pointer into program) that match must include, or NULL
53 * regmlen length of regmust string
55 * Regstart and reganch permit very fast decisions on suitable starting points
56 * for a match, cutting down the work a lot. Regmust permits fast rejection
57 * of lines that cannot possibly match. The regmust tests are costly enough
58 * that regcomp() supplies a regmust only if the r.e. contains something
59 * potentially expensive (at present, the only such thing detected is * or +
60 * at the start of the r.e., which can involve a lot of backup). Regmlen is
61 * supplied because the test in regexec() needs it and regcomp() is computing
66 * Structure for regexp "program". This is essentially a linear encoding
67 * of a nondeterministic finite-state machine (aka syntax charts or
68 * "railroad normal form" in parsing technology). Each node is an opcode
69 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
70 * all nodes except BRANCH implement concatenation; a "next" pointer with
71 * a BRANCH on both ends of it is connecting two alternatives. (Here we
72 * have one of the subtle syntax dependencies: an individual BRANCH (as
73 * opposed to a collection of them) is never concatenated with anything
74 * because of operator precedence.) The operand of some types of node is
75 * a literal string; for others, it is a node leading into a sub-FSM. In
76 * particular, the operand of a BRANCH node is the first node of the branch.
77 * (NB this is *not* a tree structure: the tail of the branch connects
78 * to the thing following the set of BRANCHes.) The opcodes are:
81 /* definition number opnd? meaning */
82 #define END 0 /* no End of program. */
83 #define BOL 1 /* no Match "" at beginning of line. */
84 #define EOL 2 /* no Match "" at end of line. */
85 #define ANY 3 /* no Match any one character. */
86 #define ANYOF 4 /* str Match any character in this string. */
87 #define ANYBUT 5 /* str Match any character not in this string. */
88 #define BRANCH 6 /* node Match this alternative, or the next... */
89 #define BACK 7 /* no Match "", "next" ptr points backward. */
90 #define EXACTLY 8 /* str Match this string. */
91 #define NOTHING 9 /* no Match empty string. */
92 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
93 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
94 #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */
95 #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */
96 #define OPEN 20 /* no Mark this point in input as start of #n. */
97 /* OPEN+1 is number 1, etc. */
98 #define CLOSE 30 /* no Analogous to OPEN. */
103 * BRANCH The set of branches constituting a single choice are hooked
104 * together with their "next" pointers, since precedence prevents
105 * anything being concatenated to any individual branch. The
106 * "next" pointer of the last BRANCH in a choice points to the
107 * thing following the whole choice. This is also where the
108 * final "next" pointer of each individual branch points; each
109 * branch starts with the operand node of a BRANCH node.
111 * BACK Normal "next" pointers all implicitly point forward; BACK
112 * exists to make loop structures possible.
114 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
115 * BRANCH structures using BACK. Simple cases (one character
116 * per match) are implemented with STAR and PLUS for speed
117 * and to minimize recursive plunges.
119 * OPEN,CLOSE ...are numbered at compile time.
123 * A node is one char of opcode followed by two chars of "next" pointer.
124 * "Next" pointers are stored as two 8-bit pieces, high order first. The
125 * value is a positive offset from the opcode of the node containing it.
126 * An operand, if any, simply follows the node. (Note that much of the
127 * code generation knows about this implicit relationship.)
129 * Using two bytes for the "next" pointer is vast overkill for most things,
130 * but allows patterns to get big without disasters.
133 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
134 #define OPERAND(p) ((p) + 3)
137 * See regmagic.h for one further detail of program structure.
142 * Utility definitions.
145 #define UCHARAT(p) ((int)*(unsigned char *)(p))
147 #define UCHARAT(p) ((int)*(p)&CHARBITS)
150 #define FAIL(m) { regerror(m); return(NULL); }
151 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 * Flags to be passed up and down.
156 #define HASWIDTH 01 /* Known never to match null string. */
157 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
158 #define SPSTART 04 /* Starts with * or +. */
159 #define WORST 0 /* Worst case. */
162 * Global work variables for regcomp().
164 static char *regparse; /* Input-scan pointer. */
165 static int regnpar; /* () count. */
166 static char regdummy;
167 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
168 static long regsize; /* Code size. */
171 * Forward declarations for regcomp()'s friends.
174 #define STATIC static
177 STATIC char *regbranch();
178 STATIC char *regpiece();
179 STATIC char *regatom();
180 STATIC char *regnode();
181 STATIC char *regnext();
183 STATIC void reginsert();
184 STATIC void regtail();
185 STATIC void regoptail();
187 STATIC int strcspn();
191 - regcomp - compile a regular expression into internal code
193 * We can't allocate space until we know how big the compiled form will be,
194 * but we can't compile it (and thus know how big it is) until we've got a
195 * place to put the code. So we cheat: we compile it twice, once with code
196 * generation turned off and size counting turned on, and once "for real".
197 * This also means that we don't allocate space until we are sure that the
198 * thing really will compile successfully, and we never have to move the
199 * code and thus invalidate pointers into it. (Note that it has to be in
200 * one piece because free() must be able to free it all.)
202 * Beware that the optimization-preparation code in here knows about some
203 * of the structure of the compiled regexp.
216 FAIL("NULL argument");
218 /* First pass: determine size, legality. */
220 if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */
222 regparse = (char *)exp;
227 if (reg(0, &flags) == NULL)
230 /* Small enough for pointer-storage convention? */
231 if (regsize >= 32767L) /* Probably could be 65535L. */
232 FAIL("regexp too big");
234 /* Allocate space. */
235 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
237 FAIL("out of space");
239 /* Second pass: emit code. */
240 regparse = (char *)exp;
242 regcode = r->program;
244 if (reg(0, &flags) == NULL)
247 /* Dig out information for optimizations. */
248 r->regstart = '\0'; /* Worst-case defaults. */
252 scan = r->program+1; /* First BRANCH. */
253 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
254 scan = OPERAND(scan);
256 /* Starting-point info. */
257 if (OP(scan) == EXACTLY)
258 r->regstart = *OPERAND(scan);
259 else if (OP(scan) == BOL)
263 * If there's something expensive in the r.e., find the
264 * longest literal string that must appear and make it the
265 * regmust. Resolve ties in favor of later strings, since
266 * the regstart check works with the beginning of the r.e.
267 * and avoiding duplication strengthens checking. Not a
268 * strong reason, but sufficient in the absence of others.
273 for (; scan != NULL; scan = regnext(scan))
274 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
275 longest = OPERAND(scan);
276 len = strlen(OPERAND(scan));
278 r->regmust = longest;
287 - reg - regular expression, i.e. main body or parenthesized thing
289 * Caller must absorb opening parenthesis.
291 * Combining parenthesis handling with the base level of regular expression
292 * is a trifle forced, but the need to tie the tails of the branches to what
293 * follows makes it hard to avoid.
297 int paren; /* Parenthesized? */
306 *flagp = HASWIDTH; /* Tentatively. */
308 /* Make an OPEN node, if parenthesized. */
310 if (regnpar >= NSUBEXP)
314 ret = regnode(OPEN+parno);
318 /* Pick up the branches, linking them together. */
319 br = regbranch(&flags);
323 regtail(ret, br); /* OPEN -> first. */
326 if (!(flags&HASWIDTH))
328 *flagp |= flags&SPSTART;
329 while (*regparse == '|' || *regparse == '\n') {
331 br = regbranch(&flags);
334 regtail(ret, br); /* BRANCH -> BRANCH. */
335 if (!(flags&HASWIDTH))
337 *flagp |= flags&SPSTART;
340 /* Make a closing node, and hook it on the end. */
341 ender = regnode((paren) ? CLOSE+parno : END);
344 /* Hook the tails of the branches to the closing node. */
345 for (br = ret; br != NULL; br = regnext(br))
346 regoptail(br, ender);
348 /* Check for proper termination. */
349 if (paren && *regparse++ != ')') {
350 FAIL("unmatched ()");
351 } else if (!paren && *regparse != '\0') {
352 if (*regparse == ')') {
353 FAIL("unmatched ()");
355 FAIL("junk on end"); /* "Can't happen". */
363 - regbranch - one alternative of an | operator
365 * Implements the concatenation operator.
376 *flagp = WORST; /* Tentatively. */
378 ret = regnode(BRANCH);
380 while (*regparse != '\0' && *regparse != ')' &&
381 *regparse != '\n' && *regparse != '|') {
382 latest = regpiece(&flags);
385 *flagp |= flags&HASWIDTH;
386 if (chain == NULL) /* First piece. */
387 *flagp |= flags&SPSTART;
389 regtail(chain, latest);
392 if (chain == NULL) /* Loop ran zero times. */
393 (void) regnode(NOTHING);
399 - regpiece - something followed by possible [*+?]
401 * Note that the branching code sequences used for ? and the general cases
402 * of * and + are somewhat optimized: they use the same NOTHING node as
403 * both the endmarker for their branch list and the body of the last branch.
404 * It might seem that this node could be dispensed with entirely, but the
405 * endmarker role is not redundant.
416 ret = regatom(&flags);
426 if (!(flags&HASWIDTH) && op != '?')
427 FAIL("*+ operand could be empty");
428 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
430 if (op == '*' && (flags&SIMPLE))
431 reginsert(STAR, ret);
432 else if (op == '*') {
433 /* Emit x* as (x&|), where & means "self". */
434 reginsert(BRANCH, ret); /* Either x */
435 regoptail(ret, regnode(BACK)); /* and loop */
436 regoptail(ret, ret); /* back */
437 regtail(ret, regnode(BRANCH)); /* or */
438 regtail(ret, regnode(NOTHING)); /* null. */
439 } else if (op == '+' && (flags&SIMPLE))
440 reginsert(PLUS, ret);
441 else if (op == '+') {
442 /* Emit x+ as x(&|), where & means "self". */
443 next = regnode(BRANCH); /* Either */
445 regtail(regnode(BACK), ret); /* loop back */
446 regtail(next, regnode(BRANCH)); /* or */
447 regtail(ret, regnode(NOTHING)); /* null. */
448 } else if (op == '?') {
449 /* Emit x? as (x|) */
450 reginsert(BRANCH, ret); /* Either x */
451 regtail(ret, regnode(BRANCH)); /* or */
452 next = regnode(NOTHING); /* null. */
454 regoptail(ret, next);
457 if (ISMULT(*regparse))
464 - regatom - the lowest level
466 * Optimization: gobbles an entire sequence of ordinary characters so that
467 * it can turn them into a single node, which is smaller to store and
468 * faster to run. Backslashed characters are exceptions, each becoming a
469 * separate node; the code is simpler that way and it's not worth fixing.
478 *flagp = WORST; /* Tentatively. */
480 switch (*regparse++) {
481 /* FIXME: these chars only have meaning at beg/end of pat? */
490 *flagp |= HASWIDTH|SIMPLE;
497 if (*regparse == '^') { /* Complement of range. */
498 ret = regnode(ANYBUT);
501 ret = regnode(ANYOF);
502 if (*regparse == ']' || *regparse == '-')
504 while (*regparse != '\0' && *regparse != ']') {
505 if (*regparse == '-') {
507 if (*regparse == ']' || *regparse == '\0')
510 class = UCHARAT(regparse-2);
511 classend = UCHARAT(regparse);
512 if (__collate_load_error) {
513 if (class > classend)
514 FAIL("invalid [] range");
515 for (class++; class <= classend; class++)
518 if (__collate_range_cmp(class, classend) > 0)
519 FAIL("invalid [] range");
520 for (i = 0; i <= UCHAR_MAX; i++)
522 && __collate_range_cmp(class, i) <= 0
523 && __collate_range_cmp(i, classend) <= 0
533 if (*regparse != ']')
534 FAIL("unmatched []");
536 *flagp |= HASWIDTH|SIMPLE;
540 ret = reg(1, &flags);
543 *flagp |= flags&(HASWIDTH|SPSTART);
549 FAIL("internal urp"); /* Supposed to be caught earlier. */
554 FAIL("?+* follows nothing");
557 switch (*regparse++) {
562 ret = regnode(WORDA);
565 ret = regnode(WORDZ);
567 /* FIXME: Someday handle \1, \2, ... */
569 /* Handle general quoted chars in exact-match routine */
576 * Encode a string of characters to be matched exactly.
578 * This is a bit tricky due to quoted chars and due to
579 * '*', '+', and '?' taking the SINGLE char previous
582 * On entry, the char at regparse[-1] is going to go
583 * into the string, no matter what it is. (It could be
584 * following a \ if we are entered from the '\' case.)
586 * Basic idea is to pick up a good char in ch and
587 * examine the next char. If it's *+? then we twiddle.
588 * If it's \ then we frozzle. If it's other magic char
589 * we push ch and terminate the string. If none of the
590 * above, we push ch on the string and go around again.
592 * regprev is used to remember where "the current char"
593 * starts in the string, if due to a *+? we need to back
594 * up and put the current char in a separate, 1-char, string.
595 * When regprev is NULL, ch is the only char in the
596 * string; this is used in *+? handling, and in setting
597 * flags |= SIMPLE at the end.
603 regparse--; /* Look at cur char */
604 ret = regnode(EXACTLY);
605 for ( regprev = 0 ; ; ) {
606 ch = *regparse++; /* Get current char */
607 switch (*regparse) { /* look at next one */
610 regc(ch); /* Add cur to string */
613 case '.': case '[': case '(':
614 case ')': case '|': case '\n':
617 /* FIXME, $ and ^ should not always be magic */
619 regc(ch); /* dump cur char */
620 goto done; /* and we are done */
622 case '?': case '+': case '*':
623 if (!regprev) /* If just ch in str, */
624 goto magic; /* use it */
625 /* End mult-char string one early */
626 regparse = regprev; /* Back up parse */
630 regc(ch); /* Cur char OK */
631 switch (regparse[1]){ /* Look after \ */
635 /* FIXME: Someday handle \1, \2, ... */
636 goto done; /* Not quoted */
638 /* Backup point is \, scan * point is after it. */
641 continue; /* NOT break; */
644 regprev = regparse; /* Set backup point */
649 if (!regprev) /* One char? */
659 - regnode - emit a node
661 static char * /* Location. */
669 if (ret == ®dummy) {
676 *ptr++ = '\0'; /* Null "next" pointer. */
684 - regc - emit (if appropriate) a byte of code
690 if (regcode != ®dummy)
697 - reginsert - insert an operator in front of already-emitted operand
699 * Means relocating the operand.
710 if (regcode == ®dummy) {
721 place = opnd; /* Op node, where operand used to be. */
728 - regtail - set the next-pointer at the end of a node chain
742 /* Find last node. */
745 temp = regnext(scan);
751 if (OP(scan) == BACK)
755 *(scan+1) = (offset>>8)&0377;
756 *(scan+2) = offset&0377;
760 - regoptail - regtail on operand of first argument; nop if operandless
767 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
768 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
770 regtail(OPERAND(p), val);
774 * regexec and friends
778 * Global work variables for regexec().
780 static char *reginput; /* String-input pointer. */
781 static char *regbol; /* Beginning of input, for ^ check. */
782 static char **regstartp; /* Pointer to startp array. */
783 static char **regendp; /* Ditto for endp. */
789 STATIC int regmatch();
790 STATIC int regrepeat();
795 STATIC char *regprop();
799 - regexec - match a regexp against a string
802 regexec(prog, string)
807 extern char *strchr();
810 if (prog == NULL || string == NULL) {
811 regerror("NULL parameter");
815 /* Check validity of program. */
816 if (UCHARAT(prog->program) != MAGIC) {
817 regerror("corrupted program");
821 /* If there is a "must appear" string, look for it. */
822 if (prog->regmust != NULL) {
824 while ((s = strchr(s, prog->regmust[0])) != NULL) {
825 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
826 break; /* Found it. */
829 if (s == NULL) /* Not present. */
833 /* Mark beginning of line for ^ . */
834 regbol = (char *)string;
836 /* Simplest case: anchored match need be tried only once. */
838 return(regtry(prog, string));
840 /* Messy cases: unanchored match. */
842 if (prog->regstart != '\0')
843 /* We know what char it must start with. */
844 while ((s = strchr(s, prog->regstart)) != NULL) {
850 /* We don't -- general case. */
854 } while (*s++ != '\0');
861 - regtry - try match at specific point
863 static int /* 0 failure, 1 success */
873 regstartp = prog->startp;
874 regendp = prog->endp;
878 for (i = NSUBEXP; i > 0; i--) {
882 if (regmatch(prog->program + 1)) {
883 prog->startp[0] = string;
884 prog->endp[0] = reginput;
891 - regmatch - main matching routine
893 * Conceptually the strategy is simple: check to see whether the current
894 * node matches, call self recursively to see whether the rest matches,
895 * and then act accordingly. In practice we make some effort to avoid
896 * recursion, in particular by going through "ordinary" nodes (that don't
897 * need to know whether the rest of the match failed) by a loop instead of
900 static int /* 0 failure, 1 success */
904 char *scan; /* Current node. */
905 char *next; /* Next node. */
909 if (scan != NULL && regnarrate)
910 fprintf(stderr, "%s(\n", regprop(scan));
912 while (scan != NULL) {
915 fprintf(stderr, "%s...\n", regprop(scan));
917 next = regnext(scan);
921 if (reginput != regbol)
925 if (*reginput != '\0')
929 /* Must be looking at a letter, digit, or _ */
930 if ((!isalnum((unsigned char)*reginput)) && *reginput != '_')
932 /* Prev must be BOL or nonword */
933 if (reginput > regbol &&
934 (isalnum((unsigned char)reginput[-1]) || reginput[-1] == '_'))
938 /* Must be looking at non letter, digit, or _ */
939 if (isalnum((unsigned char)*reginput) || *reginput == '_')
941 /* We don't care what the previous char was */
944 if (*reginput == '\0')
952 opnd = OPERAND(scan);
953 /* Inline the first character, for speed. */
954 if (*opnd != *reginput)
957 if (len > 1 && strncmp(opnd, reginput, len) != 0)
963 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
968 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
988 no = OP(scan) - OPEN;
991 if (regmatch(next)) {
993 * Don't set startp if some later
994 * invocation of the same parentheses
997 if (regstartp[no] == NULL)
998 regstartp[no] = save;
1016 no = OP(scan) - CLOSE;
1019 if (regmatch(next)) {
1021 * Don't set endp if some later
1022 * invocation of the same parentheses
1025 if (regendp[no] == NULL)
1035 if (OP(next) != BRANCH) /* No choice. */
1036 next = OPERAND(scan); /* Avoid recursion. */
1040 if (regmatch(OPERAND(scan)))
1043 scan = regnext(scan);
1044 } while (scan != NULL && OP(scan) == BRANCH);
1058 * Lookahead to avoid useless match attempts
1059 * when we know what character comes next.
1062 if (OP(next) == EXACTLY)
1063 nextch = *OPERAND(next);
1064 min = (OP(scan) == STAR) ? 0 : 1;
1066 no = regrepeat(OPERAND(scan));
1068 /* If it could work, try it. */
1069 if (nextch == '\0' || *reginput == nextch)
1072 /* Couldn't or didn't -- back up. */
1074 reginput = save + no;
1080 return(1); /* Success! */
1083 regerror("memory corruption");
1092 * We get here only if there's trouble -- normally "case END" is
1093 * the terminating point.
1095 regerror("corrupted pointers");
1100 - regrepeat - repeatedly match something simple, report how many
1114 count = strlen(scan);
1118 while (*opnd == *scan) {
1124 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1130 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1135 default: /* Oh dear. Called inappropriately. */
1136 regerror("internal foulup");
1137 count = 0; /* Best compromise. */
1146 - regnext - dig the "next" pointer out of a node
1169 STATIC char *regprop();
1172 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1179 char op = EXACTLY; /* Arbitrary non-END op. */
1181 extern char *strchr();
1185 while (op != END) { /* While that wasn't END last time... */
1187 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1189 if (next == NULL) /* Next ptr. */
1192 printf("(%d)", (s-r->program)+(next-s));
1194 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1195 /* Literal string, where present. */
1196 while (*s != '\0') {
1205 /* Header fields of interest. */
1206 if (r->regstart != '\0')
1207 printf("start `%c' ", r->regstart);
1209 printf("anchored ");
1210 if (r->regmust != NULL)
1211 printf("must have \"%s\"", r->regmust);
1216 - regprop - printable representation of opcode
1223 static char buf[50];
1225 (void) strcpy(buf, ":");
1267 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1279 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1295 regerror("corrupted opcode");
1299 (void) strcat(buf, p);
1305 * The following is provided for those people who do not have strcspn() in
1306 * their C libraries. They should get off their butts and do something
1307 * about it; at least one public-domain implementation of those (highly
1308 * useful) string routines has been published on Usenet.
1312 * strcspn - find length of initial segment of s1 consisting entirely
1313 * of characters not from s2
1326 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1327 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1328 if (*scan1 == *scan2++)