1 /* $NetBSD: regcomp.c,v 1.7 2011/11/19 17:45:11 tnozaki Exp $ */
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 * The Regents of the University of California. All rights reserved.
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer of the University of Toronto.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * @(#)regcomp.c 8.4 (Berkeley) 3/19/94
42 #if defined(LIBC_SCCS) && !defined(lint)
43 static char sccsid[] = "@(#)regcomp.c 8.4 (Berkeley) 3/19/94";
44 #endif /* LIBC_SCCS and not lint */
46 #include <sys/types.h>
61 * parse structure, passed up and down to avoid global variables and
65 RCHAR_T *next; /* next character in RE */
66 RCHAR_T *end; /* end of string (-> NUL normally) */
67 int error; /* has an error been seen? */
68 sop *strip; /* malloced strip */
69 RCHAR_T *stripdata; /* malloced stripdata */
70 sopno ssize; /* malloced strip size (allocated) */
71 sopno slen; /* malloced strip length (used) */
72 int ncsalloc; /* number of csets allocated */
74 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
75 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
76 sopno pend[NPAREN]; /* -> ) ([0] unused) */
79 /* ========= begin header generated by ./mkh ========= */
84 /* === regcomp.c === */
85 static void p_ere __P((struct parse *p, int stop, size_t reclimit));
86 static void p_ere_exp __P((struct parse *p, size_t reclimit));
87 static void p_str __P((struct parse *p));
88 static void p_bre __P((struct parse *p, int end1, int end2, size_t reclimit));
89 static int p_simp_re __P((struct parse *p, int starordinary, size_t reclimit));
90 static int p_count __P((struct parse *p));
91 static void p_bracket __P((struct parse *p));
92 static void p_b_term __P((struct parse *p, cset *cs));
93 static void p_b_cclass __P((struct parse *p, cset *cs));
94 static void p_b_eclass __P((struct parse *p, cset *cs));
95 static char p_b_symbol __P((struct parse *p));
96 static char p_b_coll_elem __P((struct parse *p, int endc));
97 static char othercase __P((int ch));
98 static void bothcases __P((struct parse *p, int ch));
99 static void ordinary __P((struct parse *p, int ch));
100 static void nonnewline __P((struct parse *p));
101 static void repeat __P((struct parse *p, sopno start, int from, int to, size_t reclimit));
102 static int seterr __P((struct parse *p, int e));
103 static cset *allocset __P((struct parse *p));
104 static void freeset __P((struct parse *p, cset *cs));
105 static int freezeset __P((struct parse *p, cset *cs));
106 static int firstch __P((struct parse *p, cset *cs));
107 static int nch __P((struct parse *p, cset *cs));
108 static void mcadd __P((struct parse *p, cset *cs, const char *cp));
110 static void mcsub __P((cset *cs, char *cp));
111 static int mcin __P((cset *cs, char *cp));
112 static char *mcfind __P((cset *cs, char *cp));
114 static void mcinvert __P((struct parse *p, cset *cs));
115 static void mccase __P((struct parse *p, cset *cs));
117 static int isinsets __P((struct re_guts *g, int c));
118 static int samesets __P((struct re_guts *g, int c1, int c2));
120 static void categorize __P((struct parse *p, struct re_guts *g));
121 static sopno dupl __P((struct parse *p, sopno start, sopno finish));
122 static void doemit __P((struct parse *p, sop op, size_t opnd));
123 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
124 static void dofwd __P((struct parse *p, sopno pos, sop value));
125 static int enlarge __P((struct parse *p, sopno size));
126 static void stripsnug __P((struct parse *p, struct re_guts *g));
127 static void findmust __P((struct parse *p, struct re_guts *g));
128 static sopno pluscount __P((struct parse *p, struct re_guts *g));
133 /* ========= end header generated by ./mkh ========= */
135 static RCHAR_T nuls[10]; /* place to point scanner in event of error */
138 * macros for use with parse structure
139 * BEWARE: these know that the parse structure is named `p' !!!
141 #define PEEK() (*p->next)
142 #define PEEK2() (*(p->next+1))
143 #define MORE() (p->next < p->end)
144 #define MORE2() (p->next+1 < p->end)
145 #define SEE(c) (MORE() && PEEK() == (c))
146 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
147 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
148 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
149 #define NEXT() (p->next++)
150 #define NEXT2() (p->next += 2)
151 #define NEXTn(n) (p->next += (n))
152 #define GETNEXT() (*p->next++)
153 #define SETERROR(e) seterr(p, (e))
154 #define REQUIRE(co, e) ((co) || SETERROR(e))
155 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
156 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
157 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
158 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
159 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
160 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
161 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
162 #define HERE() (p->slen)
163 #define THERE() (p->slen - 1)
164 #define THERETHERE() (p->slen - 2)
165 #define DROP(n) (p->slen -= (n))
168 static int never = 0; /* for use in asserts; shuts lint up */
170 #define never 0 /* some <assert.h>s have bugs too */
173 #define MEMLIMIT 0x8000000
175 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \
176 (p)->ncsalloc * sizeof(cset) + \
177 (p)->ssize * sizeof(sop))
181 - regcomp - interface for parser and compilation
182 = extern int regcomp(regex_t *, const RCHAR_T *, int);
183 = #define REG_BASIC 0000
184 = #define REG_EXTENDED 0001
185 = #define REG_ICASE 0002
186 = #define REG_NOSUB 0004
187 = #define REG_NEWLINE 0010
188 = #define REG_NOSPEC 0020
189 = #define REG_PEND 0040
190 = #define REG_DUMP 0200
192 int /* 0 success, otherwise REG_something */
193 regcomp(regex_t *preg, const RCHAR_T *pattern, int cflags)
196 register struct re_guts *g;
197 register struct parse *p = &pa;
201 # define GOODFLAGS(f) (f)
203 # define GOODFLAGS(f) ((f)&~REG_DUMP)
206 cflags = GOODFLAGS(cflags);
207 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
210 if (cflags®_PEND) {
211 if (preg->re_endp < pattern)
213 len = preg->re_endp - pattern;
215 len = STRLEN(pattern);
217 /* do the mallocs early so failure handling is easy */
218 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
219 (NC-1)*sizeof(cat_t));
222 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
223 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
224 if (p->strip == NULL) {
228 p->stripdata = (RCHAR_T *)malloc(p->ssize * sizeof(RCHAR_T));
229 if (p->stripdata == NULL) {
230 free((char *)p->strip);
238 p->next = (RCHAR_T *)pattern; /* convenience; we do not modify it */
239 p->end = p->next + len;
242 for (i = 0; i < NPAREN; i++) {
258 g->ncategories = 1; /* category 0 is "everything else" */
259 g->categories = &g->catspace[-(CHAR_MIN)];
260 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
266 g->firststate = THERE();
267 if (cflags®_EXTENDED)
269 else if (cflags®_NOSPEC)
272 p_bre(p, OUT, OUT, 0);
274 g->laststate = THERE();
276 /* tidy up loose ends and fill things in */
280 g->nplus = pluscount(p, g);
282 preg->re_nsub = g->nsub;
284 preg->re_magic = MAGIC1;
286 /* not debugging, so can't rely on the assert() in regexec() */
288 SETERROR(REG_ASSERT);
291 /* win or lose, we're done */
292 if (p->error != 0) /* lose */
298 - p_ere - ERE parser top level, concatenation and alternation
299 == static void p_ere(register struct parse *p, int stop, size_t reclimit);
302 p_ere(register struct parse *p, int stop, size_t reclimit)
304 /* character this ERE should end at */
307 register sopno prevback = 0;
308 register sopno prevfwd = 0;
310 register int first = 1; /* is this the first alternative? */
312 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
313 p->error = REG_ESPACE;
318 /* do a bunch of concatenated expressions */
320 while (MORE() && (c = PEEK()) != '|' && c != stop)
321 p_ere_exp(p, reclimit);
322 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
325 break; /* NOTE BREAK OUT */
328 INSERT(OCH_, conc); /* offset is wrong */
333 ASTERN(OOR1, prevback);
335 AHEAD(prevfwd); /* fix previous offset */
337 EMIT(OOR2, 0); /* offset is very wrong */
340 if (!first) { /* tail-end fixups */
342 ASTERN(O_CH, prevback);
345 assert(!MORE() || SEE(stop));
349 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
350 == static void p_ere_exp(register struct parse *p);
353 p_ere_exp(register struct parse *p, size_t reclimit)
359 register sopno subno;
362 assert(MORE()); /* caller should have ensured this */
368 (void)REQUIRE(MORE(), REG_EPAREN);
372 p->pbegin[subno] = HERE();
373 EMIT(OLPAREN, subno);
375 p_ere(p, ')', reclimit);
376 if (subno < NPAREN) {
377 p->pend[subno] = HERE();
378 assert(p->pend[subno] != 0);
380 EMIT(ORPAREN, subno);
381 (void)MUSTEAT(')', REG_EPAREN);
383 #ifndef POSIX_MISTAKE
384 case ')': /* happens only if no current unmatched ( */
386 * You may ask, why the ifndef? Because I didn't notice
387 * this until slightly too late for 1003.2, and none of the
388 * other 1003.2 regular-expression reviewers noticed it at
389 * all. So an unmatched ) is legal POSIX, at least until
390 * we can get it fixed.
392 SETERROR(REG_EPAREN);
397 p->g->iflags |= USEBOL;
403 p->g->iflags |= USEEOL;
412 SETERROR(REG_BADRPT);
415 if (p->g->cflags®_NEWLINE)
424 (void)REQUIRE(MORE(), REG_EESCAPE);
428 case '{': /* okay as ordinary except if digit follows */
429 (void)REQUIRE(!MORE() || !ISDIGIT((UCHAR_T)PEEK()), REG_BADRPT);
439 /* we call { a repetition if followed by a digit */
440 if (!( c == '*' || c == '+' || c == '?' ||
441 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ))
442 return; /* no repetition, we're done */
445 (void)REQUIRE(!wascaret, REG_BADRPT);
447 case '*': /* implemented as +? */
448 /* this case does not require the (y|) trick, noKLUDGE */
451 INSERT(OQUEST_, pos);
452 ASTERN(O_QUEST, pos);
459 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
460 INSERT(OCH_, pos); /* offset slightly wrong */
461 ASTERN(OOR1, pos); /* this one's right */
462 AHEAD(pos); /* fix the OCH_ */
463 EMIT(OOR2, 0); /* offset very wrong... */
464 AHEAD(THERE()); /* ...so fix it */
465 ASTERN(O_CH, THERETHERE());
470 if (ISDIGIT((UCHAR_T)PEEK())) {
472 (void)REQUIRE(count <= count2, REG_BADBR);
473 } else /* single number with comma */
475 } else /* just a single number */
477 repeat(p, pos, count, count2, 0);
478 if (!EAT('}')) { /* error heuristics */
479 while (MORE() && PEEK() != '}')
481 (void)REQUIRE(MORE(), REG_EBRACE);
490 if (!( c == '*' || c == '+' || c == '?' ||
491 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ) )
493 SETERROR(REG_BADRPT);
497 - p_str - string (no metacharacters) "parser"
498 == static void p_str(register struct parse *p);
501 p_str(register struct parse *p)
503 (void)REQUIRE(MORE(), REG_EMPTY);
505 ordinary(p, GETNEXT());
509 - p_bre - BRE parser top level, anchoring and concatenation
510 == static void p_bre(register struct parse *p, register int end1, \
511 == register int end2, size_t reclimit);
512 * Giving end1 as OUT essentially eliminates the end1/end2 check.
514 * This implementation is a bit of a kludge, in that a trailing $ is first
515 * taken as an ordinary character and then revised to be an anchor. The
516 * only undesirable side effect is that '$' gets included as a character
517 * category in such cases. This is fairly harmless; not worth fixing.
518 * The amount of lookahead needed to avoid this kludge is excessive.
521 p_bre(register struct parse *p, register int end1, register int end2, size_t reclimit)
523 /* first terminating character */
524 /* second terminating character */
526 register sopno start;
527 register int first = 1; /* first subexpression? */
528 register int wasdollar = 0;
530 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
531 p->error = REG_ESPACE;
539 p->g->iflags |= USEBOL;
542 while (MORE() && !SEETWO(end1, end2)) {
543 wasdollar = p_simp_re(p, first, reclimit);
546 if (wasdollar) { /* oops, that was a trailing anchor */
549 p->g->iflags |= USEEOL;
553 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
557 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
558 == static int p_simp_re(register struct parse *p, int starordinary, size_t reclimit);
560 static int /* was the simple RE an unbackslashed $? */
561 p_simp_re(register struct parse *p, int starordinary, size_t reclimit)
563 /* is a leading * an ordinary character? */
570 register sopno subno;
573 pos = HERE(); /* repetion op, if any, covers from here */
575 assert(MORE()); /* caller should have ensured this */
579 (void)REQUIRE(MORE(), REG_EESCAPE);
580 c = (unsigned char)GETNEXT();
583 SETERROR(REG_BADRPT);
589 p->pbegin[subno] = HERE();
590 EMIT(OLPAREN, subno);
591 /* the MORE here is an error heuristic */
592 if (MORE() && !SEETWO('\\', ')'))
593 p_bre(p, '\\', ')', reclimit);
594 if (subno < NPAREN) {
595 p->pend[subno] = HERE();
596 assert(p->pend[subno] != 0);
598 EMIT(ORPAREN, subno);
599 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
601 case ')': /* should not get here -- must be user */
603 SETERROR(REG_EPAREN);
616 if (p->pend[i] != 0) {
617 assert(i <= p->g->nsub);
619 assert(p->pbegin[i] != 0);
620 assert(p->strip[p->pbegin[i]] == OLPAREN);
621 assert(p->strip[p->pend[i]] == ORPAREN);
622 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
625 SETERROR(REG_ESUBREG);
635 if (p->g->cflags®_NEWLINE)
644 (void)REQUIRE(starordinary, REG_BADRPT);
652 if (EAT('*')) { /* implemented as +? */
653 /* this case does not require the (y|) trick, noKLUDGE */
656 INSERT(OQUEST_, pos);
657 ASTERN(O_QUEST, pos);
658 } else if (EATTWO('\\', '{')) {
661 if (MORE() && ISDIGIT((UCHAR_T)PEEK())) {
663 (void)REQUIRE(count <= count2, REG_BADBR);
664 } else /* single number with comma */
666 } else /* just a single number */
668 repeat(p, pos, count, count2, reclimit);
669 if (!EATTWO('\\', '}')) { /* error heuristics */
670 while (MORE() && !SEETWO('\\', '}'))
672 (void)REQUIRE(MORE(), REG_EBRACE);
675 } else if (!backsl && c == (unsigned char)'$') /* $ (but not \$) ends it */
682 - p_count - parse a repetition count
683 == static int p_count(register struct parse *p);
685 static int /* the value */
686 p_count(register struct parse *p)
688 register int count = 0;
689 register int ndigits = 0;
691 while (MORE() && ISDIGIT((UCHAR_T)PEEK()) && count <= DUPMAX) {
692 count = count*10 + (GETNEXT() - '0');
696 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
701 - p_bracket - parse a bracketed character list
702 == static void p_bracket(register struct parse *p);
704 * Note a significant property of this code: if the allocset() did SETERROR,
705 * no set operations are done.
708 p_bracket(register struct parse *p)
711 register int invert = 0;
712 static RCHAR_T bow[] = { '[', ':', '<', ':', ']', ']' };
713 static RCHAR_T eow[] = { '[', ':', '>', ':', ']', ']' };
719 /* Dept of Truly Sickening Special-Case Kludges */
720 if (p->next + 5 < p->end && MEMCMP(p->next, bow, 6) == 0) {
725 if (p->next + 5 < p->end && MEMCMP(p->next, eow, 6) == 0) {
732 invert++; /* make note to invert set at end */
737 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
741 (void)MUSTEAT(']', REG_EBRACK);
743 if (p->error != 0) /* don't mess things up further */
746 if (p->g->cflags®_ICASE) {
750 for (i = p->g->csetsize - 1; i >= 0; i--)
751 if (CHIN(cs, i) && isalpha(i)) {
756 if (cs->multis != NULL)
762 for (i = p->g->csetsize - 1; i >= 0; i--)
767 if (p->g->cflags®_NEWLINE)
769 if (cs->multis != NULL)
773 assert(cs->multis == NULL); /* xxx */
775 if (nch(p, cs) == 1) { /* optimize singleton sets */
776 ordinary(p, firstch(p, cs));
779 EMIT(OANYOF, freezeset(p, cs));
783 - p_b_term - parse one term of a bracketed character list
784 == static void p_b_term(register struct parse *p, register cset *cs);
787 p_b_term(register struct parse *p, register cset *cs)
790 register char start, finish;
793 /* classify what we've got */
794 switch ((MORE()) ? PEEK() : '\0') {
796 c = (MORE2()) ? PEEK2() : '\0';
799 SETERROR(REG_ERANGE);
800 return; /* NOTE RETURN */
808 case ':': /* character class */
810 (void)REQUIRE(MORE(), REG_EBRACK);
812 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
814 (void)REQUIRE(MORE(), REG_EBRACK);
815 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
817 case '=': /* equivalence class */
819 (void)REQUIRE(MORE(), REG_EBRACK);
821 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
823 (void)REQUIRE(MORE(), REG_EBRACK);
824 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
826 default: /* symbol, ordinary character, or range */
827 /* xxx revision needed for multichar stuff */
828 start = p_b_symbol(p);
829 if (SEE('-') && MORE2() && PEEK2() != ']') {
835 finish = p_b_symbol(p);
838 /* xxx what about signed chars here... */
839 (void)REQUIRE(start <= finish, REG_ERANGE);
840 for (i = start; i <= finish; i++)
847 - p_b_cclass - parse a character-class name and deal with it
848 == static void p_b_cclass(register struct parse *p, register cset *cs);
851 p_b_cclass(register struct parse *p, register cset *cs)
853 register RCHAR_T *sp = p->next;
854 register struct cclass *cp;
856 register const char *u;
859 while (MORE() && isalpha(PEEK()))
862 for (cp = cclasses; cp->name != NULL; cp++)
863 if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len))
865 if (cp->name == NULL) {
866 /* oops, didn't find it */
867 SETERROR(REG_ECTYPE);
872 while ((c = *u++) != '\0')
874 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
879 - p_b_eclass - parse an equivalence-class name and deal with it
880 == static void p_b_eclass(register struct parse *p, register cset *cs);
882 * This implementation is incomplete. xxx
885 p_b_eclass(register struct parse *p, register cset *cs)
889 c = p_b_coll_elem(p, '=');
894 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
895 == static char p_b_symbol(register struct parse *p);
897 static char /* value of symbol */
898 p_b_symbol(register struct parse *p)
902 (void)REQUIRE(MORE(), REG_EBRACK);
903 if (!EATTWO('[', '.'))
906 /* collating symbol */
907 value = p_b_coll_elem(p, '.');
908 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
913 - p_b_coll_elem - parse a collating-element name and look it up
914 == static char p_b_coll_elem(register struct parse *p, int endc);
916 static char /* value of collating element */
917 p_b_coll_elem(register struct parse *p, int endc)
919 /* name ended by endc,']' */
921 register RCHAR_T *sp = p->next;
922 register struct cname *cp;
925 while (MORE() && !SEETWO(endc, ']'))
928 SETERROR(REG_EBRACK);
932 for (cp = cnames; cp->name != NULL; cp++)
933 if (STRLEN(cp->name) == len && MEMCMP(cp->name, sp, len))
934 return(cp->code); /* known name */
936 return(*sp); /* single character */
937 SETERROR(REG_ECOLLATE); /* neither */
942 - othercase - return the case counterpart of an alphabetic
943 == static char othercase(int ch);
945 static char /* if no counterpart, return ch */
951 else if (islower(ch))
953 else /* peculiar, but could happen */
958 - bothcases - emit a dualcase version of a two-case character
959 == static void bothcases(register struct parse *p, int ch);
961 * Boy, is this implementation ever a kludge...
964 bothcases(register struct parse *p, int ch)
966 register RCHAR_T *oldnext = p->next;
967 register RCHAR_T *oldend = p->end;
970 assert(othercase(ch) != ch); /* p_bracket() would recurse */
977 assert(p->next == bracket+2);
983 - ordinary - emit an ordinary character
984 == static void ordinary(register struct parse *p, register int ch);
987 ordinary(register struct parse *p, register int ch)
990 register cat_t *cap = p->g->categories;
993 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch)
996 EMIT(OCHAR, (UCHAR_T)ch);
999 cap[ch] = p->g->ncategories++;
1005 - nonnewline - emit REG_NEWLINE version of OANY
1006 == static void nonnewline(register struct parse *p);
1008 * Boy, is this implementation ever a kludge...
1011 nonnewline(register struct parse *p)
1013 register RCHAR_T *oldnext = p->next;
1014 register RCHAR_T *oldend = p->end;
1024 assert(p->next == bracket+3);
1030 - repeat - generate code for a bounded repetition, recursively if needed
1031 == static void repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit);
1034 repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit)
1036 /* operand from here to end of strip */
1037 /* repeated from this number */
1038 /* to this number of times (maybe INFINITY) */
1040 register sopno finish;
1043 # define REP(f, t) ((f)*8 + (t))
1044 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1045 register sopno copy;
1047 if (reclimit++ > RECLIMIT)
1048 p->error = REG_ESPACE;
1056 switch (REP(MAP(from), MAP(to))) {
1057 case REP(0, 0): /* must be user doing this */
1058 DROP(finish-start); /* drop the operand */
1060 case REP(0, 1): /* as x{1,1}? */
1061 case REP(0, N): /* as x{1,n}? */
1062 case REP(0, INF): /* as x{1,}? */
1063 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1064 INSERT(OCH_, start); /* offset is wrong... */
1065 repeat(p, start+1, 1, to, reclimit);
1066 ASTERN(OOR1, start);
1067 AHEAD(start); /* ... fix it */
1070 ASTERN(O_CH, THERETHERE());
1072 case REP(1, 1): /* trivial case */
1075 case REP(1, N): /* as x?x{1,n-1} */
1076 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1077 INSERT(OCH_, start);
1078 ASTERN(OOR1, start);
1080 EMIT(OOR2, 0); /* offset very wrong... */
1081 AHEAD(THERE()); /* ...so fix it */
1082 ASTERN(O_CH, THERETHERE());
1083 copy = dupl(p, start+1, finish+1);
1084 assert(copy == finish+4);
1085 repeat(p, copy, 1, to-1, reclimit);
1087 case REP(1, INF): /* as x+ */
1088 INSERT(OPLUS_, start);
1089 ASTERN(O_PLUS, start);
1091 case REP(N, N): /* as xx{m-1,n-1} */
1092 copy = dupl(p, start, finish);
1093 repeat(p, copy, from-1, to-1, reclimit);
1095 case REP(N, INF): /* as xx{n-1,INF} */
1096 copy = dupl(p, start, finish);
1097 repeat(p, copy, from-1, to, reclimit);
1099 default: /* "can't happen" */
1100 SETERROR(REG_ASSERT); /* just in case */
1106 - seterr - set an error condition
1107 == static int seterr(register struct parse *p, int e);
1109 static int /* useless but makes type checking happy */
1110 seterr(register struct parse *p, int e)
1112 if (p->error == 0) /* keep earliest error condition */
1114 p->next = nuls; /* try to bring things to a halt */
1116 return(0); /* make the return value well-defined */
1120 - allocset - allocate a set of characters for []
1121 == static cset *allocset(register struct parse *p);
1124 allocset(register struct parse *p)
1126 register int no = p->g->ncsets++;
1128 register size_t nbytes;
1130 register size_t css = (size_t)p->g->csetsize;
1133 if (no >= p->ncsalloc) { /* need another column of space */
1134 p->ncsalloc += CHAR_BIT;
1136 assert(nc % CHAR_BIT == 0);
1137 nbytes = nc / CHAR_BIT * css;
1138 if (MEMSIZE(p) > MEMLIMIT)
1140 if (p->g->sets == NULL)
1141 p->g->sets = (cset *)malloc(nc * sizeof(cset));
1143 p->g->sets = (cset *)realloc((char *)p->g->sets,
1145 if (p->g->setbits == NULL)
1146 p->g->setbits = (uch *)malloc(nbytes);
1148 p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1150 /* xxx this isn't right if setbits is now NULL */
1151 for (i = 0; i < no; i++)
1152 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1154 if (p->g->sets != NULL && p->g->setbits != NULL)
1155 (void) memset((char *)p->g->setbits + (nbytes - css),
1160 SETERROR(REG_ESPACE);
1161 /* caller's responsibility not to do set ops */
1166 cs = &p->g->sets[no];
1167 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1168 cs->mask = 1 << ((no) % CHAR_BIT);
1177 - freeset - free a now-unused set
1178 == static void freeset(register struct parse *p, register cset *cs);
1181 freeset(register struct parse *p, register cset *cs)
1184 register cset *top = &p->g->sets[p->g->ncsets];
1185 register size_t css = (size_t)p->g->csetsize;
1187 for (i = 0; i < css; i++)
1189 if (cs == top-1) /* recover only the easy case */
1194 - freezeset - final processing on a set of characters
1195 == static int freezeset(register struct parse *p, register cset *cs);
1197 * The main task here is merging identical sets. This is usually a waste
1198 * of time (although the hash code minimizes the overhead), but can win
1199 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1200 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1203 static int /* set number */
1204 freezeset(register struct parse *p, register cset *cs)
1206 register uch h = cs->hash;
1208 register cset *top = &p->g->sets[p->g->ncsets];
1210 register size_t css = (size_t)p->g->csetsize;
1212 /* look for an earlier one which is the same */
1213 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1214 if (cs2->hash == h && cs2 != cs) {
1216 for (i = 0; i < css; i++)
1217 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1223 if (cs2 < top) { /* found one */
1228 return((int)(cs - p->g->sets));
1232 - firstch - return first character in a set (which must have at least one)
1233 == static int firstch(register struct parse *p, register cset *cs);
1235 static int /* character; there is no "none" value */
1236 firstch(register struct parse *p, register cset *cs)
1239 register size_t css = (size_t)p->g->csetsize;
1241 for (i = 0; i < css; i++)
1245 return(0); /* arbitrary */
1249 - nch - number of characters in a set
1250 == static int nch(register struct parse *p, register cset *cs);
1253 nch(register struct parse *p, register cset *cs)
1256 register size_t css = (size_t)p->g->csetsize;
1259 for (i = 0; i < css; i++)
1266 - mcadd - add a collating element to a cset
1267 == static void mcadd(register struct parse *p, register cset *cs, \
1268 == register char *cp);
1271 mcadd(register struct parse *p, register cset *cs, register const char *cp)
1273 register size_t oldend = cs->smultis;
1276 cs->smultis += strlen(cp) + 1;
1277 np = realloc(cs->multis, cs->smultis);
1282 SETERROR(REG_ESPACE);
1287 (void) strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1292 - mcsub - subtract a collating element from a cset
1293 == static void mcsub(register cset *cs, register char *cp);
1296 mcsub(register cset *cs, register char *cp)
1298 register char *fp = mcfind(cs, cp);
1299 register size_t len = strlen(fp);
1302 (void) memmove(fp, fp + len + 1,
1303 cs->smultis - (fp + len + 1 - cs->multis));
1306 if (cs->smultis == 0) {
1312 cs->multis = realloc(cs->multis, cs->smultis);
1313 assert(cs->multis != NULL);
1317 - mcin - is a collating element in a cset?
1318 == static int mcin(register cset *cs, register char *cp);
1321 mcin(register cset *cs, register char *cp)
1323 return(mcfind(cs, cp) != NULL);
1327 - mcfind - find a collating element in a cset
1328 == static char *mcfind(register cset *cs, register char *cp);
1331 mcfind(register cset *cs, register char *cp)
1335 if (cs->multis == NULL)
1337 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1338 if (strcmp(cp, p) == 0)
1345 - mcinvert - invert the list of collating elements in a cset
1346 == static void mcinvert(register struct parse *p, register cset *cs);
1348 * This would have to know the set of possibilities. Implementation
1352 mcinvert(register struct parse *p, register cset *cs)
1354 assert(cs->multis == NULL); /* xxx */
1358 - mccase - add case counterparts of the list of collating elements in a cset
1359 == static void mccase(register struct parse *p, register cset *cs);
1361 * This would have to know the set of possibilities. Implementation
1365 mccase(register struct parse *p, register cset *cs)
1367 assert(cs->multis == NULL); /* xxx */
1372 - isinsets - is this character in any sets?
1373 == static int isinsets(register struct re_guts *g, int c);
1375 static int /* predicate */
1376 isinsets(register struct re_guts *g, int c)
1380 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1381 register unsigned uc = (unsigned char)c;
1383 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1390 - samesets - are these two characters in exactly the same sets?
1391 == static int samesets(register struct re_guts *g, int c1, int c2);
1393 static int /* predicate */
1394 samesets(register struct re_guts *g, int c1, int c2)
1398 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1399 register unsigned uc1 = (unsigned char)c1;
1400 register unsigned uc2 = (unsigned char)c2;
1402 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1403 if (col[uc1] != col[uc2])
1410 - categorize - sort out character categories
1411 == static void categorize(struct parse *p, register struct re_guts *g);
1414 categorize(struct parse *p, register struct re_guts *g)
1417 register cat_t *cats = g->categories;
1422 /* avoid making error situations worse */
1426 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1427 if (cats[c] == 0 && isinsets(g, c)) {
1428 cat = g->ncategories++;
1430 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1431 if (cats[c2] == 0 && samesets(g, c, c2))
1438 - dupl - emit a duplicate of a bunch of sops
1439 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1441 static sopno /* start of duplicate */
1442 dupl(register struct parse *p, sopno start, sopno finish)
1445 /* to this less one */
1447 register sopno ret = HERE();
1448 register sopno len = finish - start;
1450 assert(finish >= start);
1453 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1455 assert(p->ssize >= p->slen + len);
1456 (void) memcpy((char *)(p->strip + p->slen),
1457 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1458 (void) memcpy((char *)(p->stripdata + p->slen),
1459 (char *)(p->stripdata + start), (size_t)len*sizeof(RCHAR_T));
1465 - doemit - emit a strip operator
1466 == static void doemit(register struct parse *p, sop op, size_t opnd);
1468 * It might seem better to implement this as a macro with a function as
1469 * hard-case backup, but it's just too big and messy unless there are
1470 * some changes to the data structures. Maybe later.
1473 doemit(register struct parse *p, sop op, size_t opnd)
1475 /* avoid making error situations worse */
1479 /* deal with oversize operands ("can't happen", more or less) */
1482 /* deal with undersized strip */
1483 if (p->slen >= p->ssize)
1484 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1487 /* finally, it's all reduced to the easy case */
1488 p->strip[p->slen] = op;
1489 p->stripdata[p->slen] = opnd;
1494 - doinsert - insert a sop into the strip
1495 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1498 doinsert(register struct parse *p, sop op, size_t opnd, sopno pos)
1505 /* avoid making error situations worse */
1510 EMIT(op, opnd); /* do checks, ensure space */
1511 assert(HERE() == sn+1);
1513 d = p->stripdata[sn];
1515 /* adjust paren pointers */
1517 for (i = 1; i < NPAREN; i++) {
1518 if (p->pbegin[i] >= pos) {
1521 if (p->pend[i] >= pos) {
1526 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1527 (HERE()-pos-1)*sizeof(sop));
1528 memmove((char *)&p->stripdata[pos+1], (char *)&p->stripdata[pos],
1529 (HERE()-pos-1)*sizeof(RCHAR_T));
1531 p->stripdata[pos] = d;
1535 - dofwd - complete a forward reference
1536 == static void dofwd(register struct parse *p, sopno pos, sop value);
1539 dofwd(register struct parse *p, register sopno pos, sop value)
1541 /* avoid making error situations worse */
1546 p->stripdata[pos] = value;
1550 - enlarge - enlarge the strip
1551 == static int enlarge(register struct parse *p, sopno size);
1554 enlarge(register struct parse *p, register sopno size)
1557 register RCHAR_T *dp;
1560 if (p->ssize >= size)
1565 if (MEMSIZE(p) > MEMLIMIT)
1567 sp = realloc(p->strip, p->ssize * sizeof(sop));
1571 dp = realloc(p->stripdata, p->ssize * sizeof(RCHAR_T));
1575 SETERROR(REG_ESPACE);
1583 - stripsnug - compact the strip
1584 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1587 stripsnug(register struct parse *p, register struct re_guts *g)
1589 g->nstates = p->slen;
1590 g->strip = (sop *)realloc((char *)p->strip,
1591 p->slen * sizeof(sop));
1592 if (g->strip == NULL) {
1593 SETERROR(REG_ESPACE);
1594 g->strip = p->strip;
1596 g->stripdata = (RCHAR_T *)realloc((char *)p->stripdata,
1597 p->slen * sizeof(RCHAR_T));
1598 if (g->stripdata == NULL) {
1599 SETERROR(REG_ESPACE);
1600 g->stripdata = p->stripdata;
1605 - findmust - fill in must and mlen with longest mandatory literal string
1606 == static void findmust(register struct parse *p, register struct re_guts *g);
1608 * This algorithm could do fancy things like analyzing the operands of |
1609 * for common subsequences. Someday. This code is simple and finds most
1610 * of the interesting cases.
1612 * Note that must and mlen got initialized during setup.
1615 findmust(struct parse *p, register struct re_guts *g)
1617 register sop *scans;
1618 register RCHAR_T *scand;
1620 RCHAR_T *startd = NULL;
1621 register sop *newstarts = 0;
1622 register RCHAR_T *newstartd = NULL;
1623 register sopno newlen;
1626 register RCHAR_T *cp;
1629 /* avoid making error situations worse */
1633 /* find the longest OCHAR sequence in strip */
1635 scans = g->strip + 1;
1636 scand = g->stripdata + 1;
1641 case OCHAR: /* sequence member */
1642 if (newlen == 0) { /* new sequence */
1643 newstarts = scans - 1;
1644 newstartd = scand - 1;
1648 case OPLUS_: /* things that don't break one */
1652 case OQUEST_: /* things that must be skipped */
1661 /* assert() interferes w debug printouts */
1662 if (s != O_QUEST && s != O_CH && s != OOR2) {
1666 } while (s != O_QUEST && s != O_CH);
1668 default: /* things that break a sequence */
1669 if (newlen > g->mlen) { /* ends one */
1677 } while (s != OEND);
1679 if (g->mlen == 0) /* there isn't one */
1682 /* turn it into a character string */
1683 g->must = malloc(((size_t)g->mlen + 1) * sizeof(RCHAR_T));
1684 if (g->must == NULL) { /* argh; just forget it */
1691 for (i = g->mlen; i > 0; i--) {
1698 assert(cp < g->must + g->mlen);
1701 assert(cp == g->must + g->mlen);
1702 *cp++ = '\0'; /* just on general principles */
1706 - pluscount - count + nesting
1707 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1709 static sopno /* nesting depth */
1710 pluscount(struct parse *p, register struct re_guts *g)
1714 register sopno plusnest = 0;
1715 register sopno maxnest = 0;
1718 return(0); /* there may not be an OEND */
1720 scan = g->strip + 1;
1728 if (plusnest > maxnest)
1733 } while (s != OEND);