From: Peter Avalos Date: Tue, 27 Jan 2009 14:17:46 +0000 (-0500) Subject: Sync libc/regex with FreeBSD: X-Git-Tag: v2.3.1~152^2~61 X-Git-Url: https://gitweb.dragonflybsd.org/~nant/dragonfly.git/commitdiff_plain/a0bb9527e66ae20ed57597e6fdafbcfc65888244 Sync libc/regex with FreeBSD: -Make regular expression matching aware of multibyte characters. -Some WARNS cleanup. -Only stop evaluation of a back reference if the match length is zero and the recursion level is too deep. -Add restrict type-qualifier. --- diff --git a/include/regex.h b/include/regex.h index 8456891008..08594979e0 100644 --- a/include/regex.h +++ b/include/regex.h @@ -35,6 +35,7 @@ * SUCH DAMAGE. * * @(#)regex.h 8.2 (Berkeley) 1/3/94 + * $FreeBSD: src/include/regex.h,v 1.11 2004/07/12 06:07:26 tjr Exp $ * $DragonFly: src/include/regex.h,v 1.3 2008/06/02 06:50:08 hasso Exp $ */ @@ -45,7 +46,12 @@ #include /* types */ -typedef off_t regoff_t; +typedef __off_t regoff_t; + +#ifndef _SIZE_T_DECLARED +typedef __size_t size_t; +#define _SIZE_T_DECLARED +#endif typedef struct { int re_magic; @@ -70,6 +76,7 @@ typedef struct { #define REG_DUMP 0200 /* regerror() flags */ +#define REG_ENOSYS (-1) #define REG_NOMATCH 1 #define REG_BADPAT 2 #define REG_ECOLLATE 3 @@ -86,6 +93,7 @@ typedef struct { #define REG_EMPTY 14 #define REG_ASSERT 15 #define REG_INVARG 16 +#define REG_ILLSEQ 17 #define REG_ATOI 255 /* convert name to number (!) */ #define REG_ITOA 0400 /* convert number to name (!) 
*/ @@ -98,11 +106,16 @@ typedef struct { #define REG_BACKR 02000 /* force use of backref code */ __BEGIN_DECLS -int regcomp (regex_t *, const char *, int); -size_t regerror (int, const regex_t *, char *, size_t); -int regexec (const regex_t *, - const char *, size_t, regmatch_t [], int); -void regfree (regex_t *); +int regcomp(regex_t * __restrict, const char * __restrict, int); +size_t regerror(int, const regex_t * __restrict, char * __restrict, size_t); +/* + * XXX fourth parameter should be `regmatch_t [__restrict]', but isn't because + * of a bug in GCC (when -std=c99 is specified) which perceives this as a + * syntax error. + */ +int regexec(const regex_t * __restrict, const char * __restrict, size_t, + regmatch_t * __restrict, int); +void regfree(regex_t *); __END_DECLS #endif /* !_REGEX_H_ */ diff --git a/lib/libc/regex/cclass.h b/lib/libc/regex/cclass.h deleted file mode 100644 index 4e601a3cd4..0000000000 --- a/lib/libc/regex/cclass.h +++ /dev/null @@ -1,63 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. 
Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)cclass.h 8.3 (Berkeley) 3/20/94 - * $DragonFly: src/lib/libc/regex/cclass.h,v 1.3 2008/06/05 18:06:30 swildner Exp $ - */ - - -typedef enum {CALNUM, CALPHA, CBLANK, CCNTRL, CDIGIT, CGRAPH, - CLOWER, CPRINT, CPUNCT, CSPACE, CUPPER, CXDIGIT} citype; - -/* character-class table */ -static struct cclass { - char *name; - citype fidx; -} cclasses[] = { - {"alnum", CALNUM}, - {"alpha", CALPHA}, - {"blank", CBLANK}, - {"cntrl", CCNTRL}, - {"digit", CDIGIT}, - {"graph", CGRAPH}, - {"lower", CLOWER}, - {"print", CPRINT}, - {"punct", CPUNCT}, - {"space", CSPACE}, - {"upper", CUPPER}, - {"xdigit", CXDIGIT}, - {NULL, 0} -}; diff --git a/lib/libc/regex/cname.h b/lib/libc/regex/cname.h index 634e88e79c..018728e978 100644 --- a/lib/libc/regex/cname.h +++ b/lib/libc/regex/cname.h @@ -14,10 +14,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
- * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -35,6 +31,7 @@ * SUCH DAMAGE. * * @(#)cname.h 8.3 (Berkeley) 3/20/94 + * $FreeBSD: src/lib/libc/regex/cname.h,v 1.4 2007/01/09 00:28:04 imp Exp $ * $DragonFly: src/lib/libc/regex/cname.h,v 1.2 2005/04/27 11:26:18 joerg Exp $ */ diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c index a92832db81..5e589f0331 100644 --- a/lib/libc/regex/engine.c +++ b/lib/libc/regex/engine.c @@ -14,10 +14,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -35,8 +31,7 @@ * SUCH DAMAGE. 
* * @(#)engine.c 8.5 (Berkeley) 3/20/94 - * - * $FreeBSD: src/lib/libc/regex/engine.c,v 1.5.8.1 2000/07/31 06:30:37 dcs Exp $ + * $FreeBSD: src/lib/libc/regex/engine.c,v 1.21 2007/05/25 12:44:58 delphij Exp $ * $DragonFly: src/lib/libc/regex/engine.c,v 1.7 2005/11/20 09:18:37 swildner Exp $ */ @@ -69,22 +64,34 @@ #define at lat #define match lmat #endif +#ifdef MNAMES +#define matcher mmatcher +#define fast mfast +#define slow mslow +#define dissect mdissect +#define backref mbackref +#define step mstep +#define print mprint +#define at mat +#define match mmat +#endif /* another structure passed up and down to avoid zillions of parameters */ struct match { struct re_guts *g; int eflags; regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ - char *offp; /* offsets work from here */ - char *beginp; /* start of string -- virtual NUL precedes */ - char *endp; /* end of string -- virtual NUL here */ - char *coldp; /* can be no match starting before here */ - char **lastpos; /* [nplus+1] */ + const char *offp; /* offsets work from here */ + const char *beginp; /* start of string -- virtual NUL precedes */ + const char *endp; /* end of string -- virtual NUL here */ + const char *coldp; /* can be no match starting before here */ + const char **lastpos; /* [nplus+1] */ STATEVARS; states st; /* current states */ states fresh; /* states for a fresh start */ states tmp; /* temporary */ states empty; /* empty set of states */ + mbstate_t mbs; /* multibyte conversion state */ }; /* ========= begin header generated by ./mkh ========= */ @@ -93,29 +100,29 @@ extern "C" { #endif /* === engine.c === */ -static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags); -static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst); -static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev); -static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno 
stopst); -static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst); -static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft); -#define BOL (OUT+1) -#define EOL (BOL+1) -#define BOLEOL (BOL+2) -#define NOTHING (BOL+3) -#define BOW (BOL+4) -#define EOW (BOL+5) -#define CODEMAX (BOL+5) /* highest code used */ -#define NONCHAR(c) ((c) > CHAR_MAX) -#define NNONCHAR (CODEMAX-CHAR_MAX) +static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); +static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); +static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); +static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); +static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); +static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); +#define MAX_RECURSION 100 +#define BOL (OUT-1) +#define EOL (BOL-1) +#define BOLEOL (BOL-2) +#define NOTHING (BOL-3) +#define BOW (BOL-4) +#define EOW (BOL-5) +#define BADCHAR (BOL-6) +#define NONCHAR(c) ((c) <= OUT) #ifdef REDEBUG -static void print(struct match *m, char *caption, states st, int ch, FILE *d); +static void print(struct match *m, const char *caption, states st, int ch, FILE *d); #endif #ifdef REDEBUG -static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); +static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); #endif #ifdef REDEBUG -static char *pchar(int ch); +static const char *pchar(int ch); #endif #ifdef __cplusplus @@ -135,32 +142,33 @@ static char *pchar(int ch); /* - matcher - the actual matching engine - == static int matcher(struct 
re_guts *g, char *string, \ + == static int matcher(struct re_guts *g, const char *string, \ == size_t nmatch, regmatch_t pmatch[], int eflags); */ static int /* 0 success, REG_NOMATCH failure */ -matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], +matcher(struct re_guts *g, + const char *string, + size_t nmatch, + regmatch_t pmatch[], int eflags) { - char *endp; + const char *endp; int i; struct match mv; struct match *m = &mv; - char *dp; + const char *dp; const sopno gf = g->firststate+1; /* +1 for OEND */ const sopno gl = g->laststate; - char *start; - char *stop; + const char *start; + const char *stop; /* Boyer-Moore algorithms variables */ - char *pp; + const char *pp; int cj, mj; - char *mustfirst; - char *mustlast; + const char *mustfirst; + const char *mustlast; int *matchjump; int *charjump; - dp = NULL; - /* simplify the situation where possible */ if (g->cflags&REG_NOSUB) nmatch = 0; @@ -184,8 +192,8 @@ matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], pp = mustlast; for (dp = start+g->mlen-1; dp < stop;) { /* Fast skip non-matches */ - while (dp < stop && charjump[*dp]) - dp += charjump[*dp]; + while (dp < stop && charjump[(int)*dp]) + dp += charjump[(int)*dp]; if (dp >= stop) break; @@ -201,7 +209,7 @@ matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], /* Jump to next possible match */ mj = matchjump[pp - mustfirst]; - cj = charjump[*dp]; + cj = charjump[(int)*dp]; dp += (cj < mj ? 
mj : cj); pp = mustlast; } @@ -232,6 +240,7 @@ matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], SETUP(m->tmp); SETUP(m->empty); CLEAR(m->empty); + ZAPSTATE(&m->mbs); /* Adjust start according to moffset, to speed things up */ if (g->moffset > -1) @@ -259,7 +268,8 @@ matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], if (endp != NULL) break; assert(m->coldp < m->endp); - m->coldp++; + m->coldp += XMBRTOWC(NULL, m->coldp, + m->endp - m->coldp, &m->mbs, 0); } if (nmatch == 1 && !g->backrefs) break; /* no further info needed */ @@ -279,15 +289,15 @@ matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], dp = dissect(m, m->coldp, endp, gf, gl); } else { if (g->nplus > 0 && m->lastpos == NULL) - m->lastpos = (char **)malloc((g->nplus+1) * - sizeof(char *)); + m->lastpos = malloc((g->nplus+1) * + sizeof(const char *)); if (g->nplus > 0 && m->lastpos == NULL) { free(m->pmatch); STATETEARDOWN(m); return(REG_ESPACE); } NOTE("backref dissect"); - dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); + dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); } if (dp != NULL) break; @@ -310,7 +320,7 @@ matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], } #endif NOTE("backoff dissect"); - dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); + dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); } assert(dp == NULL || dp == endp); if (dp != NULL) /* found a shorter one */ @@ -318,7 +328,9 @@ matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], /* despite initial appearances, there is no match here */ NOTE("false alarm"); - start = m->coldp + 1; /* recycle starting later */ + /* recycle starting later */ + start = m->coldp + XMBRTOWC(NULL, m->coldp, + stop - m->coldp, &m->mbs, 0); assert(start <= stop); } @@ -348,25 +360,29 @@ matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], /* - dissect - figure out what matched what, no back 
references - == static char *dissect(struct match *m, char *start, \ - == char *stop, sopno startst, sopno stopst); + == static const char *dissect(struct match *m, const char *start, \ + == const char *stop, sopno startst, sopno stopst); */ -static char * /* == stop (success) always */ -dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) +static const char * /* == stop (success) always */ +dissect(struct match *m, + const char *start, + const char *stop, + sopno startst, + sopno stopst) { int i; sopno ss; /* start sop of current subRE */ sopno es; /* end sop of current subRE */ - char *sp; /* start of string matched by it */ - char *stp; /* string matched by it cannot pass here */ - char *rest; /* start of rest of string */ - char *tail; /* string unmatched by rest of RE */ + const char *sp; /* start of string matched by it */ + const char *stp; /* string matched by it cannot pass here */ + const char *rest; /* start of rest of string */ + const char *tail; /* string unmatched by rest of RE */ sopno ssub; /* start sop of subsubRE */ sopno esub; /* end sop of subsubRE */ - char *ssp; /* start of string matched by subsubRE */ - char *sep; /* end of string matched by subsubRE */ - char *oldssp; /* previous ssp */ - char *dp; + const char *ssp; /* start of string matched by subsubRE */ + const char *sep; /* end of string matched by subsubRE */ + const char *oldssp; /* previous ssp */ + const char *dp; AT("diss", start, stop, startst, stopst); sp = start; @@ -391,7 +407,7 @@ dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) assert(nope); break; case OCHAR: - sp++; + sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; case OBOL: case OEOL: @@ -400,7 +416,7 @@ dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) break; case OANY: case OANYOF: - sp++; + sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; case OBACK_: case O_BACK: @@ -531,25 +547,31 @@ dissect(struct match *m, 
char *start, char *stop, sopno startst, sopno stopst) /* - backref - figure out what matched what, figuring in back references - == static char *backref(struct match *m, char *start, \ - == char *stop, sopno startst, sopno stopst, sopno lev); + == static const char *backref(struct match *m, const char *start, \ + == const char *stop, sopno startst, sopno stopst, sopno lev); */ -static char * /* == stop (success) or NULL (failure) */ -backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, - sopno lev) /* PLUS nesting level */ +static const char * /* == stop (success) or NULL (failure) */ +backref(struct match *m, + const char *start, + const char *stop, + sopno startst, + sopno stopst, + sopno lev, /* PLUS nesting level */ + int rec) { int i; sopno ss; /* start sop of current subRE */ - char *sp; /* start of string matched by it */ + const char *sp; /* start of string matched by it */ sopno ssub; /* start sop of subsubRE */ sopno esub; /* end sop of subsubRE */ - char *ssp; /* start of string matched by subsubRE */ - char *dp; + const char *ssp; /* start of string matched by subsubRE */ + const char *dp; size_t len; int hard; sop s; regoff_t offsave; cset *cs; + wint_t wc; AT("back", start, stop, startst, stopst); sp = start; @@ -559,17 +581,25 @@ backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, for (ss = startst; !hard && ss < stopst; ss++) switch (OP(s = m->g->strip[ss])) { case OCHAR: - if (sp == stop || *sp++ != (char)OPND(s)) + if (sp == stop) + return(NULL); + sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); + if (wc != OPND(s)) return(NULL); break; case OANY: if (sp == stop) return(NULL); - sp++; + sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); + if (wc == BADCHAR) + return (NULL); break; case OANYOF: + if (sp == stop) + return (NULL); cs = &m->g->sets[OPND(s)]; - if (sp == stop || !CHIN(cs, *sp++)) + sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); + if (wc == BADCHAR || !CHIN(cs, wc)) 
return(NULL); break; case OBOL: @@ -642,6 +672,8 @@ backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, return(NULL); assert(m->pmatch[i].rm_so != -1); len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; + if (len == 0 && rec++ > MAX_RECURSION) + return(NULL); assert(stop - m->beginp >= len); if (sp > stop - len) return(NULL); /* not enough left to match */ @@ -650,28 +682,28 @@ backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, return(NULL); while (m->g->strip[ss] != SOP(O_BACK, i)) ss++; - return(backref(m, sp+len, stop, ss+1, stopst, lev)); + return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); break; case OQUEST_: /* to null or not */ - dp = backref(m, sp, stop, ss+1, stopst, lev); + dp = backref(m, sp, stop, ss+1, stopst, lev, rec); if (dp != NULL) return(dp); /* not */ - return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); + return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); break; case OPLUS_: assert(m->lastpos != NULL); assert(lev+1 <= m->g->nplus); m->lastpos[lev+1] = sp; - return(backref(m, sp, stop, ss+1, stopst, lev+1)); + return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); break; case O_PLUS: if (sp == m->lastpos[lev]) /* last pass matched null */ - return(backref(m, sp, stop, ss+1, stopst, lev-1)); + return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); /* try another pass */ m->lastpos[lev] = sp; - dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); + dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); if (dp == NULL) - return(backref(m, sp, stop, ss+1, stopst, lev-1)); + return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); else return(dp); break; @@ -680,7 +712,7 @@ backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, esub = ss + OPND(s) - 1; assert(OP(m->g->strip[esub]) == OOR1); for (;;) { /* find first matching branch */ - dp = backref(m, sp, stop, ssub, esub, lev); + dp = backref(m, sp, stop, ssub, esub, lev, rec); if (dp != NULL) 
return(dp); /* that one missed, try next one */ @@ -701,7 +733,7 @@ backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, assert(0 < i && i <= m->g->nsub); offsave = m->pmatch[i].rm_so; m->pmatch[i].rm_so = sp - m->offp; - dp = backref(m, sp, stop, ss+1, stopst, lev); + dp = backref(m, sp, stop, ss+1, stopst, lev, rec); if (dp != NULL) return(dp); m->pmatch[i].rm_so = offsave; @@ -712,7 +744,7 @@ backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, assert(0 < i && i <= m->g->nsub); offsave = m->pmatch[i].rm_eo; m->pmatch[i].rm_eo = sp - m->offp; - dp = backref(m, sp, stop, ss+1, stopst, lev); + dp = backref(m, sp, stop, ss+1, stopst, lev, rec); if (dp != NULL) return(dp); m->pmatch[i].rm_eo = offsave; @@ -731,21 +763,26 @@ backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, /* - fast - step through the string at top speed - == static char *fast(struct match *m, char *start, \ - == char *stop, sopno startst, sopno stopst); + == static const char *fast(struct match *m, const char *start, \ + == const char *stop, sopno startst, sopno stopst); */ -static char * /* where tentative match ended, or NULL */ -fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) +static const char * /* where tentative match ended, or NULL */ +fast( struct match *m, + const char *start, + const char *stop, + sopno startst, + sopno stopst) { states st = m->st; states fresh = m->fresh; states tmp = m->tmp; - char *p = start; - int c = (start == m->beginp) ? 
OUT : *(start-1); - int lastc; /* previous c */ - int flagch; + const char *p = start; + wint_t c; + wint_t lastc; /* previous c */ + wint_t flagch; int i; - char *coldp; /* last p after which no match was underway */ + const char *coldp; /* last p after which no match was underway */ + size_t clen; CLEAR(st); SET1(st, startst); @@ -753,10 +790,24 @@ fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) ASSIGN(fresh, st); SP("start", st, *p); coldp = NULL; + if (start == m->beginp) + c = OUT; + else { + /* + * XXX Wrong if the previous character was multi-byte. + * Newline never is (in supported encodings), + * so this only breaks the ISWORD tests below. + */ + c = (uch)*(start - 1); + } for (;;) { /* next character */ lastc = c; - c = (p == m->endp) ? OUT : *p; + if (p == m->endp) { + clen = 0; + c = OUT; + } else + clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); if (EQ(st, fresh)) coldp = p; @@ -794,7 +845,7 @@ fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) } /* are we done? 
*/ - if (ISSET(st, stopst) || p == stop) + if (ISSET(st, stopst) || p == stop || clen > stop - p) break; /* NOTE BREAK OUT */ /* no, we must deal with this character */ @@ -804,34 +855,39 @@ fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) st = step(m->g, startst, stopst, tmp, c, st); SP("aft", st, c); assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p++; + p += clen; } assert(coldp != NULL); m->coldp = coldp; if (ISSET(st, stopst)) - return(p+1); + return(p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); else return(NULL); } /* - slow - step through the string more deliberately - == static char *slow(struct match *m, char *start, \ - == char *stop, sopno startst, sopno stopst); + == static const char *slow(struct match *m, const char *start, \ + == const char *stop, sopno startst, sopno stopst); */ -static char * /* where it ended */ -slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) +static const char * /* where it ended */ +slow( struct match *m, + const char *start, + const char *stop, + sopno startst, + sopno stopst) { states st = m->st; states empty = m->empty; states tmp = m->tmp; - char *p = start; - int c = (start == m->beginp) ? OUT : *(start-1); - int lastc; /* previous c */ - int flagch; + const char *p = start; + wint_t c; + wint_t lastc; /* previous c */ + wint_t flagch; int i; - char *matchp; /* last p at which a match ended */ + const char *matchp; /* last p at which a match ended */ + size_t clen; AT("slow", start, stop, startst, stopst); CLEAR(st); @@ -839,10 +895,24 @@ slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) SP("sstart", st, *p); st = step(m->g, startst, stopst, st, NOTHING, st); matchp = NULL; + if (start == m->beginp) + c = OUT; + else { + /* + * XXX Wrong if the previous character was multi-byte. + * Newline never is (in supported encodings), + * so this only breaks the ISWORD tests below. 
+ */ + c = (uch)*(start - 1); + } for (;;) { /* next character */ lastc = c; - c = (p == m->endp) ? OUT : *p; + if (p == m->endp) { + c = OUT; + clen = 0; + } else + clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); /* is there an EOL and/or BOL between lastc and c? */ flagch = '\0'; @@ -880,7 +950,7 @@ slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) /* are we done? */ if (ISSET(st, stopst)) matchp = p; - if (EQ(st, empty) || p == stop) + if (EQ(st, empty) || p == stop || clen > stop - p) break; /* NOTE BREAK OUT */ /* no, we must deal with this character */ @@ -890,7 +960,7 @@ slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) st = step(m->g, startst, stopst, tmp, c, st); SP("saft", st, c); assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p++; + p += clen; } return(matchp); @@ -901,28 +971,27 @@ slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) - step - map set of states reachable before char to set reachable after == static states step(struct re_guts *g, sopno start, sopno stop, \ == states bef, int ch, states aft); - == #define BOL (OUT+1) - == #define EOL (BOL+1) - == #define BOLEOL (BOL+2) - == #define NOTHING (BOL+3) - == #define BOW (BOL+4) - == #define EOW (BOL+5) - == #define CODEMAX (BOL+5) // highest code used - == #define NONCHAR(c) ((c) > CHAR_MAX) - == #define NNONCHAR (CODEMAX-CHAR_MAX) + == #define BOL (OUT-1) + == #define EOL (BOL-1) + == #define BOLEOL (BOL-2) + == #define NOTHING (BOL-3) + == #define BOW (BOL-4) + == #define EOW (BOL-5) + == #define BADCHAR (BOL-6) + == #define NONCHAR(c) ((c) <= OUT) */ static states step(struct re_guts *g, - sopno start, /* start state within strip */ - sopno stop, /* state after stop state within strip */ - states bef, /* states reachable before */ - int ch, /* character or NONCHAR code */ - states aft) /* states already known reachable after */ + sopno start, /* start state within strip */ + sopno stop, /* state 
after stop state within strip */ + states bef, /* states reachable before */ + wint_t ch, /* character or NONCHAR code */ + states aft) /* states already known reachable after */ { cset *cs; sop s; sopno pc; - onestate here; /* note, macros know this name */ + onestate here; /* note, macros know this name */ sopno look; int i; @@ -934,8 +1003,8 @@ step(struct re_guts *g, break; case OCHAR: /* only characters can match */ - assert(!NONCHAR(ch) || ch != (char)OPND(s)); - if (ch == (char)OPND(s)) + assert(!NONCHAR(ch) || ch != OPND(s)); + if (ch == OPND(s)) FWD(aft, bef, 1); break; case OBOL: @@ -1028,12 +1097,16 @@ step(struct re_guts *g, /* - print - print a set of states == #ifdef REDEBUG - == static void print(struct match *m, char *caption, states st, \ + == static void print(struct match *m, const char *caption, states st, \ == int ch, FILE *d); == #endif */ static void -print(struct match *m, char *caption, states st, int ch, FILE *d) +print(struct match *m, + const char *caption, + states st, + int ch, + FILE *d) { struct re_guts *g = m->g; int i; @@ -1056,13 +1129,17 @@ print(struct match *m, char *caption, states st, int ch, FILE *d) /* - at - print current situation == #ifdef REDEBUG - == static void at(struct match *m, char *title, char *start, char *stop, \ - == sopno startst, sopno stopst); + == static void at(struct match *m, const char *title, const char *start, \ + == const char *stop, sopno startst, sopno stopst); == #endif */ static void -at(struct match *m, char *title, char *start, char *stop, sopno startst, - sopno stopst) +at( struct match *m, + const char *title, + const char *start, + const char *stop, + sopno startst, + sopno stopst) { if (!(m->eflags&REG_TRACE)) return; @@ -1077,7 +1154,7 @@ at(struct match *m, char *title, char *start, char *stop, sopno startst, /* - pchar - make a character printable == #ifdef REDEBUG - == static char *pchar(int ch); + == static const char *pchar(int ch); == #endif * * Is this identical to regchar() over in 
debug.c? Well, yes. But a @@ -1085,7 +1162,7 @@ at(struct match *m, char *title, char *start, char *stop, sopno startst, * a matching debug.o, and this is convenient. It all disappears in * the non-debug compilation anyway, so it doesn't matter much. */ -static char * /* -> representation */ +static const char * /* -> representation */ pchar(int ch) { static char pbuf[10]; diff --git a/lib/libc/regex/re_format.7 b/lib/libc/regex/re_format.7 index 7e4e215859..7181e87192 100644 --- a/lib/libc/regex/re_format.7 +++ b/lib/libc/regex/re_format.7 @@ -34,7 +34,7 @@ .\" SUCH DAMAGE. .\" .\" @(#)re_format.7 8.3 (Berkeley) 3/20/94 -.\" $FreeBSD: src/lib/libc/regex/re_format.7,v 1.4.2.5 2002/01/22 12:40:10 ru Exp $ +.\" $FreeBSD: src/lib/libc/regex/re_format.7,v 1.12 2008/09/05 17:41:20 keramida Exp $ .\" $DragonFly: src/lib/libc/regex/re_format.7,v 1.3 2008/05/02 02:05:04 swildner Exp $ .\" .Dd March 20, 1994 @@ -233,7 +233,7 @@ sequence of characters of that collating element. The sequence is a single element of the bracket expression's list. A bracket expression containing a multi-character collating element can thus match more than one character, -e.g. if the collating sequence includes a +e.g.\& if the collating sequence includes a .Ql ch collating element, then the RE @@ -288,6 +288,14 @@ These stand for the character classes defined in A locale may provide others. A character class may not be used as an endpoint of a range. .Pp +A bracketed expression like +.Ql [[:class:]] +can be used to match a single character that belongs to a character +class. +For the reverse, matching any character that does not belong to a specific +class, the negation operator of bracket expressions may be used: +.Ql [^[:class:]] . 
+.Pp There are two special cases\(dd of bracket expressions: the bracket expressions .Ql [[:<:]] diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c index 54c313476c..68d6e22c61 100644 --- a/lib/libc/regex/regcomp.c +++ b/lib/libc/regex/regcomp.c @@ -14,10 +14,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -35,27 +31,26 @@ * SUCH DAMAGE. * * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 - * - * $FreeBSD: src/lib/libc/regex/regcomp.c,v 1.13.2.2 2002/03/20 13:13:15 dcs Exp $ + * $FreeBSD: src/lib/libc/regex/regcomp.c,v 1.36 2007/06/11 03:05:54 delphij Exp $ * $DragonFly: src/lib/libc/regex/regcomp.c,v 1.7 2005/11/20 09:18:37 swildner Exp $ - * - * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 */ #include #include #include #include +#include #include #include #include +#include +#include #include "collate.h" #include "utils.h" #include "regex2.h" -#include "cclass.h" #include "cname.h" /* @@ -82,40 +77,30 @@ extern "C" { #endif /* === regcomp.c === */ -static void p_ere(struct parse *p, int stop); +static void p_ere(struct parse *p, wint_t stop); static void p_ere_exp(struct parse *p); static void p_str(struct parse *p); -static void p_bre(struct parse *p, int end1, int end2); +static void p_bre(struct parse *p, wint_t end1, wint_t end2); static int p_simp_re(struct parse *p, int starordinary); static int p_count(struct parse *p); static void p_bracket(struct parse *p); static void 
p_b_term(struct parse *p, cset *cs); static void p_b_cclass(struct parse *p, cset *cs); static void p_b_eclass(struct parse *p, cset *cs); -static char p_b_symbol(struct parse *p); -static char p_b_coll_elem(struct parse *p, int endc); -static char othercase(int ch); -static void bothcases(struct parse *p, int ch); -static void ordinary(struct parse *p, int ch); +static wint_t p_b_symbol(struct parse *p); +static wint_t p_b_coll_elem(struct parse *p, wint_t endc); +static wint_t othercase(wint_t ch); +static void bothcases(struct parse *p, wint_t ch); +static void ordinary(struct parse *p, wint_t ch); static void nonnewline(struct parse *p); static void repeat(struct parse *p, sopno start, int from, int to); static int seterr(struct parse *p, int e); static cset *allocset(struct parse *p); static void freeset(struct parse *p, cset *cs); -static int freezeset(struct parse *p, cset *cs); -static int firstch(struct parse *p, cset *cs); -static int nch(struct parse *p, cset *cs); -static void mcadd(struct parse *p, cset *cs, char *cp); -#if used -static void mcsub(cset *cs, char *cp); -static int mcin(cset *cs, char *cp); -static char *mcfind(cset *cs, char *cp); -#endif -static void mcinvert(struct parse *p, cset *cs); -static void mccase(struct parse *p, cset *cs); -static int isinsets(struct re_guts *g, int c); -static int samesets(struct re_guts *g, int c1, int c2); -static void categorize(struct parse *p, struct re_guts *g); +static void CHadd(struct parse *p, cset *cs, wint_t ch); +static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); +static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); +static wint_t singleton(cset *cs); static sopno dupl(struct parse *p, sopno start, sopno finish); static void doemit(struct parse *p, sop op, size_t opnd); static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); @@ -123,10 +108,11 @@ static void dofwd(struct parse *p, sopno pos, sop value); static void enlarge(struct parse *p, sopno 
size); static void stripsnug(struct parse *p, struct re_guts *g); static void findmust(struct parse *p, struct re_guts *g); -static int altoffset(sop *scan, int offset, int mccs); +static int altoffset(sop *scan, int offset); static void computejumps(struct parse *p, struct re_guts *g); static void computematchjumps(struct parse *p, struct re_guts *g); static sopno pluscount(struct parse *p, struct re_guts *g); +static wint_t wgetnext(struct parse *p); #ifdef __cplusplus } @@ -151,6 +137,7 @@ static char nuls[10]; /* place to point scanner in event of error */ #define NEXT2() (p->next += 2) #define NEXTn(n) (p->next += (n)) #define GETNEXT() (*p->next++) +#define WGETNEXT() wgetnext(p) #define SETERROR(e) seterr(p, (e)) #define REQUIRE(co, e) ((co) || SETERROR(e)) #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) @@ -187,7 +174,9 @@ static int never = 0; /* for use in asserts; shuts lint up */ = #define REG_DUMP 0200 */ int /* 0 success, otherwise REG_something */ -regcomp(regex_t *preg, const char *pattern, int cflags) +regcomp(regex_t * __restrict preg, + const char * __restrict pattern, + int cflags) { struct parse pa; struct re_guts *g; @@ -212,8 +201,7 @@ regcomp(regex_t *preg, const char *pattern, int cflags) len = strlen((char *)pattern); /* do the mallocs early so failure handling is easy */ - g = (struct re_guts *)malloc(sizeof(struct re_guts) + - (NC-1)*sizeof(cat_t)); + g = (struct re_guts *)malloc(sizeof(struct re_guts)); if (g == NULL) return(REG_ESPACE); p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ @@ -234,9 +222,7 @@ regcomp(regex_t *preg, const char *pattern, int cflags) p->pbegin[i] = 0; p->pend[i] = 0; } - g->csetsize = NC; g->sets = NULL; - g->setbits = NULL; g->ncsets = 0; g->cflags = cflags; g->iflags = 0; @@ -248,9 +234,6 @@ regcomp(regex_t *preg, const char *pattern, int cflags) g->matchjump = NULL; g->mlen = 0; g->nsub = 0; - g->ncategories = 1; /* category 0 is "everything else" */ - g->categories = 
&g->catspace[-(CHAR_MIN)]; - memset((char *)g->catspace, 0, NC*sizeof(cat_t)); g->backrefs = 0; /* do it */ @@ -266,7 +249,6 @@ regcomp(regex_t *preg, const char *pattern, int cflags) g->laststate = THERE(); /* tidy up loose ends and fill things in */ - categorize(p, g); stripsnug(p, g); findmust(p, g); /* only use Boyer-Moore algorithm if the pattern is bigger @@ -303,7 +285,7 @@ regcomp(regex_t *preg, const char *pattern, int cflags) */ static void p_ere(struct parse *p, - int stop) /* character this ERE should end at */ + int stop) /* character this ERE should end at */ { char c; sopno prevback; @@ -311,9 +293,6 @@ p_ere(struct parse *p, sopno conc; int first = 1; /* is this the first alternative? */ - prevback = 0; - prevfwd = 0; - for (;;) { /* do a bunch of concatenated expressions */ conc = HERE(); @@ -353,6 +332,7 @@ static void p_ere_exp(struct parse *p) { char c; + wint_t wc; sopno pos; int count; int count2; @@ -422,14 +402,16 @@ p_ere_exp(struct parse *p) break; case '\\': REQUIRE(MORE(), REG_EESCAPE); - c = GETNEXT(); - ordinary(p, c); + wc = WGETNEXT(); + ordinary(p, wc); break; case '{': /* okay as ordinary except if digit follows */ REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); /* FALLTHROUGH */ default: - ordinary(p, c); + p->next--; + wc = WGETNEXT(); + ordinary(p, wc); break; } @@ -502,7 +484,7 @@ p_str(struct parse *p) { REQUIRE(MORE(), REG_EMPTY); while (MORE()) - ordinary(p, GETNEXT()); + ordinary(p, WGETNEXT()); } /* @@ -512,15 +494,13 @@ p_str(struct parse *p) * Giving end1 as OUT essentially eliminates the end1/end2 check. * * This implementation is a bit of a kludge, in that a trailing $ is first - * taken as an ordinary character and then revised to be an anchor. The - * only undesirable side effect is that '$' gets included as a character - * category in such cases. This is fairly harmless; not worth fixing. + * taken as an ordinary character and then revised to be an anchor. 
* The amount of lookahead needed to avoid this kludge is excessive. */ static void p_bre(struct parse *p, - int end1, /* first terminating character */ - int end2) /* second terminating character */ + int end1, /* first terminating character */ + int end2) /* second terminating character */ { sopno start = HERE(); int first = 1; /* first subexpression? */ @@ -551,13 +531,14 @@ p_bre(struct parse *p, */ static int /* was the simple RE an unbackslashed $? */ p_simp_re(struct parse *p, - int starordinary) /* is a leading * an ordinary character? */ + int starordinary) /* is a leading * an ordinary character? */ { int c; int count; int count2; sopno pos; int i; + wint_t wc; sopno subno; # define BACKSL (1<next--; + wc = WGETNEXT(); + ordinary(p, wc); break; } @@ -684,15 +667,12 @@ p_count(struct parse *p) /* - p_bracket - parse a bracketed character list == static void p_bracket(struct parse *p); - * - * Note a significant property of this code: if the allocset() did SETERROR, - * no set operations are done. 
*/ static void p_bracket(struct parse *p) { - cset *cs = allocset(p); - int invert = 0; + cset *cs; + wint_t ch; /* Dept of Truly Sickening Special-Case Kludges */ if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { @@ -706,55 +686,34 @@ p_bracket(struct parse *p) return; } + if ((cs = allocset(p)) == NULL) + return; + + if (p->g->cflags&REG_ICASE) + cs->icase = 1; if (EAT('^')) - invert++; /* make note to invert set at end */ + cs->invert = 1; if (EAT(']')) - CHadd(cs, ']'); + CHadd(p, cs, ']'); else if (EAT('-')) - CHadd(cs, '-'); + CHadd(p, cs, '-'); while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) p_b_term(p, cs); if (EAT('-')) - CHadd(cs, '-'); + CHadd(p, cs, '-'); MUSTEAT(']', REG_EBRACK); if (p->error != 0) /* don't mess things up further */ return; - if (p->g->cflags&REG_ICASE) { - int i; - int ci; + if (cs->invert && p->g->cflags&REG_NEWLINE) + cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); - for (i = p->g->csetsize - 1; i >= 0; i--) - if (CHIN(cs, i) && isalpha(i)) { - ci = othercase(i); - if (ci != i) - CHadd(cs, ci); - } - if (cs->multis != NULL) - mccase(p, cs); - } - if (invert) { - int i; - - for (i = p->g->csetsize - 1; i >= 0; i--) - if (CHIN(cs, i)) - CHsub(cs, i); - else - CHadd(cs, i); - if (p->g->cflags&REG_NEWLINE) - CHsub(cs, '\n'); - if (cs->multis != NULL) - mcinvert(p, cs); - } - - assert(cs->multis == NULL); /* xxx */ - - if (nch(p, cs) == 1) { /* optimize singleton sets */ - ordinary(p, firstch(p, cs)); + if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ + ordinary(p, ch); freeset(p, cs); } else - EMIT(OANYOF, freezeset(p, cs)); + EMIT(OANYOF, (int)(cs - p->g->sets)); } /* @@ -765,8 +724,8 @@ static void p_b_term(struct parse *p, cset *cs) { char c; - char start, finish; - int i; + wint_t start, finish; + wint_t i; /* classify what we've got */ switch ((MORE()) ?
PEEK() : '\0') { @@ -802,7 +761,6 @@ p_b_term(struct parse *p, cset *cs) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); break; default: /* symbol, ordinary character, or range */ -/* xxx revision needed for multichar stuff */ start = p_b_symbol(p); if (SEE('-') && MORE2() && PEEK2() != ']') { /* range */ @@ -814,19 +772,18 @@ p_b_term(struct parse *p, cset *cs) } else finish = start; if (start == finish) - CHadd(cs, start); + CHadd(p, cs, start); else { if (__collate_load_error) { REQUIRE((uch)start <= (uch)finish, REG_ERANGE); - for (i = (uch)start; i <= (uch)finish; i++) - CHadd(cs, i); + CHaddrange(p, cs, start, finish); } else { REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); - for (i = CHAR_MIN; i <= CHAR_MAX; i++) { + for (i = 0; i <= UCHAR_MAX; i++) { if ( __collate_range_cmp(start, i) <= 0 && __collate_range_cmp(i, finish) <= 0 ) - CHadd(cs, i); + CHadd(p, cs, i); } } } @@ -841,89 +798,25 @@ p_b_term(struct parse *p, cset *cs) static void p_b_cclass(struct parse *p, cset *cs) { - int c; char *sp = p->next; - struct cclass *cp; size_t len; + wctype_t wct; + char clname[16]; while (MORE() && isalpha((uch)PEEK())) NEXT(); len = p->next - sp; - for (cp = cclasses; cp->name != NULL; cp++) - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') - break; - if (cp->name == NULL) { - /* oops, didn't find it */ + if (len >= sizeof(clname) - 1) { SETERROR(REG_ECTYPE); return; } - - switch (cp->fidx) { - case CALNUM: - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (isalnum((uch)c)) - CHadd(cs, c); - break; - case CALPHA: - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (isalpha((uch)c)) - CHadd(cs, c); - break; - case CBLANK: - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (isblank((uch)c)) - CHadd(cs, c); - break; - case CCNTRL: - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (iscntrl((uch)c)) - CHadd(cs, c); - break; - case CDIGIT: - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (isdigit((uch)c)) - CHadd(cs, c); - break; - case CGRAPH: - for (c = CHAR_MIN; c <= 
CHAR_MAX; c++) - if (isgraph((uch)c)) - CHadd(cs, c); - break; - case CLOWER: - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (islower((uch)c)) - CHadd(cs, c); - break; - case CPRINT: - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (isprint((uch)c)) - CHadd(cs, c); - break; - case CPUNCT: - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (ispunct((uch)c)) - CHadd(cs, c); - break; - case CSPACE: - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (isspace((uch)c)) - CHadd(cs, c); - break; - case CUPPER: - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (isupper((uch)c)) - CHadd(cs, c); - break; - case CXDIGIT: - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (isxdigit((uch)c)) - CHadd(cs, c); - break; + memcpy(clname, sp, len); + clname[len] = '\0'; + if ((wct = wctype(clname)) == 0) { + SETERROR(REG_ECTYPE); + return; } -#if 0 - for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) - MCadd(p, cs, u); -#endif + CHaddtype(p, cs, wct); } /* @@ -935,24 +828,24 @@ p_b_cclass(struct parse *p, cset *cs) static void p_b_eclass(struct parse *p, cset *cs) { - char c; + wint_t c; c = p_b_coll_elem(p, '='); - CHadd(cs, c); + CHadd(p, cs, c); } /* - p_b_symbol - parse a character or [..]ed multicharacter collating symbol == static char p_b_symbol(struct parse *p); */ -static char /* value of symbol */ +static wint_t /* value of symbol */ p_b_symbol(struct parse *p) { - char value; + wint_t value; REQUIRE(MORE(), REG_EBRACK); if (!EATTWO('[', '.')) - return(GETNEXT()); + return(WGETNEXT()); /* collating symbol */ value = p_b_coll_elem(p, '.'); @@ -964,13 +857,16 @@ p_b_symbol(struct parse *p) - p_b_coll_elem - parse a collating-element name and look it up == static char p_b_coll_elem(struct parse *p, int endc); */ -static char /* value of collating element */ +static wint_t /* value of collating element */ p_b_coll_elem(struct parse *p, - int endc) /* name ended by endc,']' */ + wint_t endc) /* name ended by endc,']' */ { char *sp = p->next; struct cname *cp; int len; + mbstate_t mbs; + wchar_t wc; + 
size_t clen; while (MORE() && !SEETWO(endc, ']')) NEXT(); @@ -982,9 +878,13 @@ p_b_coll_elem(struct parse *p, for (cp = cnames; cp->name != NULL; cp++) if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') return(cp->code); /* known name */ - if (len == 1) - return(*sp); /* single character */ - SETERROR(REG_ECOLLATE); /* neither */ + memset(&mbs, 0, sizeof(mbs)); + if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) + return (wc); /* single character */ + else if (clen == (size_t)-1 || clen == (size_t)-2) + SETERROR(REG_ILLSEQ); + else + SETERROR(REG_ECOLLATE); /* neither */ return(0); } @@ -992,15 +892,14 @@ p_b_coll_elem(struct parse *p, - othercase - return the case counterpart of an alphabetic == static char othercase(int ch); */ -static char /* if no counterpart, return ch */ -othercase(int ch) +static wint_t /* if no counterpart, return ch */ +othercase(wint_t ch) { - ch = (uch)ch; - assert(isalpha(ch)); - if (isupper(ch)) - return(tolower(ch)); - else if (islower(ch)) - return(toupper(ch)); + assert(iswalpha(ch)); + if (iswupper(ch)) + return(towlower(ch)); + else if (iswlower(ch)) + return(towupper(ch)); else /* peculiar, but could happen */ return(ch); } @@ -1012,21 +911,24 @@ othercase(int ch) * Boy, is this implementation ever a kludge... 
*/ static void -bothcases(struct parse *p, int ch) +bothcases(struct parse *p, wint_t ch) { char *oldnext = p->next; char *oldend = p->end; - char bracket[3]; + char bracket[3 + MB_LEN_MAX]; + size_t n; + mbstate_t mbs; - ch = (uch)ch; assert(othercase(ch) != ch); /* p_bracket() would recurse */ p->next = bracket; - p->end = bracket+2; - bracket[0] = ch; - bracket[1] = ']'; - bracket[2] = '\0'; + memset(&mbs, 0, sizeof(mbs)); + n = wcrtomb(bracket, ch, &mbs); + assert(n != (size_t)-1); + bracket[n] = ']'; + bracket[n + 1] = '\0'; + p->end = bracket+n+1; p_bracket(p); - assert(p->next == bracket+2); + assert(p->next == p->end); p->next = oldnext; p->end = oldend; } @@ -1036,16 +938,23 @@ bothcases(struct parse *p, int ch) == static void ordinary(struct parse *p, int ch); */ static void -ordinary(struct parse *p, int ch) +ordinary(struct parse *p, wint_t ch) { - cat_t *cap = p->g->categories; + cset *cs; - if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) + if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch) bothcases(p, ch); + else if ((ch & OPDMASK) == ch) + EMIT(OCHAR, ch); else { - EMIT(OCHAR, (uch)ch); - if (cap[ch] == 0) - cap[ch] = p->g->ncategories++; + /* + * Kludge: character is too big to fit into an OCHAR operand. + * Emit a singleton set. + */ + if ((cs = allocset(p)) == NULL) + return; + CHadd(p, cs, ch); + EMIT(OANYOF, (int)(cs - p->g->sets)); } } @@ -1080,9 +989,9 @@ nonnewline(struct parse *p) */ static void repeat(struct parse *p, - sopno start, /* operand from here to end of strip */ - int from, /* repeated from this number */ - int to) /* to this number of times (maybe INFINITY) */ + sopno start, /* operand from here to end of strip */ + int from, /* repeated from this number */ + int to) /* to this number of times (maybe INFINITY) */ { sopno finish = HERE(); # define N 2 @@ -1145,6 +1054,30 @@ repeat(struct parse *p, } } +/* + - wgetnext - helper function for WGETNEXT() macro.
Gets the next wide + - character from the parse struct, signals a REG_ILLSEQ error if the + - character can't be converted. Returns the wide character read (0 on error). + */ +static wint_t +wgetnext(struct parse *p) +{ + mbstate_t mbs; + wchar_t wc; + size_t n; + + memset(&mbs, 0, sizeof(mbs)); + n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); + if (n == (size_t)-1 || n == (size_t)-2) { + SETERROR(REG_ILLSEQ); + return (0); + } + if (n == 0) + n = 1; + p->next += n; + return (wc); +} + /* - seterr - set an error condition == static int seterr(struct parse *p, int e); @@ -1166,48 +1099,16 @@ seterr(struct parse *p, int e) static cset * allocset(struct parse *p) { - int no = p->g->ncsets++; - size_t nc; - size_t nbytes; - cset *cs; - size_t css = (size_t)p->g->csetsize; - int i; + cset *cs, *ncs; - if (no >= p->ncsalloc) { /* need another column of space */ - p->ncsalloc += CHAR_BIT; - nc = p->ncsalloc; - assert(nc % CHAR_BIT == 0); - nbytes = nc / CHAR_BIT * css; - if (p->g->sets == NULL) - p->g->sets = (cset *)malloc(nc * sizeof(cset)); - else - p->g->sets = (cset *)reallocf((char *)p->g->sets, - nc * sizeof(cset)); - if (p->g->setbits == NULL) - p->g->setbits = (uch *)malloc(nbytes); - else { - p->g->setbits = (uch *)reallocf((char *)p->g->setbits, - nbytes); - /* xxx this isn't right if setbits is now NULL */ - for (i = 0; i < no; i++) - p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); - } - if (p->g->sets != NULL && p->g->setbits != NULL) - memset((char *)p->g->setbits + (nbytes - css), 0, css); - else { - no = 0; - SETERROR(REG_ESPACE); - /* caller's responsibility not to do set ops */ - } + ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); + if (ncs == NULL) { + SETERROR(REG_ESPACE); + return (NULL); } - - assert(p->g->sets != NULL); /* xxx */ - cs = &p->g->sets[no]; - cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); - cs->mask = 1 << ((no) % CHAR_BIT); - cs->hash = 0; - cs->smultis = 0; - cs->multis = NULL; + p->g->sets = ncs; + cs =
&p->g->sets[p->g->ncsets++]; + memset(cs, 0, sizeof(*cs)); return(cs); } @@ -1219,252 +1120,111 @@ allocset(struct parse *p) static void freeset(struct parse *p, cset *cs) { - int i; cset *top = &p->g->sets[p->g->ncsets]; - size_t css = (size_t)p->g->csetsize; - for (i = 0; i < css; i++) - CHsub(cs, i); + free(cs->wides); + free(cs->ranges); + free(cs->types); + memset(cs, 0, sizeof(*cs)); if (cs == top-1) /* recover only the easy case */ p->g->ncsets--; } /* - - freezeset - final processing on a set of characters - == static int freezeset(struct parse *p, cset *cs); - * - * The main task here is merging identical sets. This is usually a waste - * of time (although the hash code minimizes the overhead), but can win - * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash - * is done using addition rather than xor -- all ASCII [aA] sets xor to - * the same value! - */ -static int /* set number */ -freezeset(struct parse *p, cset *cs) -{ - short h = cs->hash; - int i; - cset *top = &p->g->sets[p->g->ncsets]; - cset *cs2; - size_t css = (size_t)p->g->csetsize; - - /* look for an earlier one which is the same */ - for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) - if (cs2->hash == h && cs2 != cs) { - /* maybe */ - for (i = 0; i < css; i++) - if (!!CHIN(cs2, i) != !!CHIN(cs, i)) - break; /* no */ - if (i == css) - break; /* yes */ - } - - if (cs2 < top) { /* found one */ - freeset(p, cs); - cs = cs2; - } - - return((int)(cs - p->g->sets)); -} - -/* - - firstch - return first character in a set (which must have at least one) - == static int firstch(struct parse *p, cset *cs); + - singleton - Determine whether a set contains only one character, + - returning it if so, otherwise returning OUT. 
*/ -static int /* character; there is no "none" value */ -firstch(struct parse *p, cset *cs) +static wint_t +singleton(cset *cs) { - int i; - size_t css = (size_t)p->g->csetsize; - - for (i = 0; i < css; i++) - if (CHIN(cs, i)) - return((char)i); - assert(never); - return(0); /* arbitrary */ -} + wint_t i, s, n; -/* - - nch - number of characters in a set - == static int nch(struct parse *p, cset *cs); - */ -static int -nch(struct parse *p, cset *cs) -{ - int i; - size_t css = (size_t)p->g->csetsize; - int n = 0; - - for (i = 0; i < css; i++) - if (CHIN(cs, i)) + for (i = n = 0; i < NC; i++) + if (CHIN(cs, i)) { n++; - return(n); + s = i; + } + if (n == 1) + return (s); + if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && + cs->icase == 0) + return (cs->wides[0]); + /* Don't bother handling the other cases. */ + return (OUT); } /* - - mcadd - add a collating element to a cset - == static void mcadd(struct parse *p, cset *cs, \ - == char *cp); + - CHadd - add character to character set. 
*/ static void -mcadd(struct parse *p, cset *cs, char *cp) +CHadd(struct parse *p, cset *cs, wint_t ch) { - size_t oldend = cs->smultis; - - cs->smultis += strlen(cp) + 1; - if (cs->multis == NULL) - cs->multis = malloc(cs->smultis); - else - cs->multis = reallocf(cs->multis, cs->smultis); - if (cs->multis == NULL) { - SETERROR(REG_ESPACE); - return; + wint_t nch, *newwides; + assert(ch >= 0); + if (ch < NC) + cs->bmp[ch >> 3] |= 1 << (ch & 7); + else { + newwides = realloc(cs->wides, (cs->nwides + 1) * + sizeof(*cs->wides)); + if (newwides == NULL) { + SETERROR(REG_ESPACE); + return; + } + cs->wides = newwides; + cs->wides[cs->nwides++] = ch; + } + if (cs->icase) { + if ((nch = towlower(ch)) < NC) + cs->bmp[nch >> 3] |= 1 << (nch & 7); + if ((nch = towupper(ch)) < NC) + cs->bmp[nch >> 3] |= 1 << (nch & 7); } - - strcpy(cs->multis + oldend - 1, cp); - cs->multis[cs->smultis - 1] = '\0'; } -#if used /* - - mcsub - subtract a collating element from a cset - == static void mcsub(cset *cs, char *cp); + - CHaddrange - add all characters in the range [min,max] to a character set. */ static void -mcsub(cset *cs, char *cp) +CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) { - char *fp = mcfind(cs, cp); - size_t len = strlen(fp); - - assert(fp != NULL); - memmove(fp, fp + len + 1, cs->smultis - (fp + len + 1 - cs->multis)); - cs->smultis -= len; + crange *newranges; - if (cs->smultis == 0) { - free(cs->multis); - cs->multis = NULL; + for (; min < NC && min <= max; min++) + CHadd(p, cs, min); + if (min >= max) + return; + newranges = realloc(cs->ranges, (cs->nranges + 1) * + sizeof(*cs->ranges)); + if (newranges == NULL) { + SETERROR(REG_ESPACE); return; } - - cs->multis = reallocf(cs->multis, cs->smultis); - assert(cs->multis != NULL); -} - -/* - - mcin - is a collating element in a cset? 
- == static int mcin(cset *cs, char *cp); - */ -static int -mcin(cset *cs, char *cp) -{ - return(mcfind(cs, cp) != NULL); -} - -/* - - mcfind - find a collating element in a cset - == static char *mcfind(cset *cs, char *cp); - */ -static char * -mcfind(cset *cs, char *cp) -{ - char *p; - - if (cs->multis == NULL) - return(NULL); - for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) - if (strcmp(cp, p) == 0) - return(p); - return(NULL); + cs->ranges = newranges; + cs->ranges[cs->nranges].min = min; + cs->ranges[cs->nranges].max = max; + cs->nranges++; } -#endif /* - - mcinvert - invert the list of collating elements in a cset - == static void mcinvert(struct parse *p, cset *cs); - * - * This would have to know the set of possibilities. Implementation - * is deferred. + - CHaddtype - add all characters of a certain type to a character set. */ static void -mcinvert(struct parse *p __unused, cset *cs) +CHaddtype(struct parse *p, cset *cs, wctype_t wct) { - assert(cs->multis == NULL); /* xxx */ -} - -/* - - mccase - add case counterparts of the list of collating elements in a cset - == static void mccase(struct parse *p, cset *cs); - * - * This would have to know the set of possibilities. Implementation - * is deferred. - */ -static void -mccase(struct parse *p __unused, cset *cs) -{ - assert(cs->multis == NULL); /* xxx */ -} - -/* - - isinsets - is this character in any sets? - == static int isinsets(struct re_guts *g, int c); - */ -static int /* predicate */ -isinsets(struct re_guts *g, int c) -{ - uch *col; - int i; - int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; - unsigned uc = (uch)c; - - for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) - if (col[uc] != 0) - return(1); - return(0); -} - -/* - - samesets - are these two characters in exactly the same sets?
- == static int samesets(struct re_guts *g, int c1, int c2); - */ -static int /* predicate */ -samesets(struct re_guts *g, int c1, int c2) -{ - uch *col; - int i; - int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; - unsigned uc1 = (uch)c1; - unsigned uc2 = (uch)c2; - - for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) - if (col[uc1] != col[uc2]) - return(0); - return(1); -} - -/* - - categorize - sort out character categories - == static void categorize(struct parse *p, struct re_guts *g); - */ -static void -categorize(struct parse *p, struct re_guts *g) -{ - cat_t *cats = g->categories; - int c; - int c2; - cat_t cat; - - /* avoid making error situations worse */ - if (p->error != 0) + wint_t i; + wctype_t *newtypes; + + for (i = 0; i < NC; i++) + if (iswctype(i, wct)) + CHadd(p, cs, i); + newtypes = realloc(cs->types, (cs->ntypes + 1) * + sizeof(*cs->types)); + if (newtypes == NULL) { + SETERROR(REG_ESPACE); return; - - for (c = CHAR_MIN; c <= CHAR_MAX; c++) - if (cats[c] == 0 && isinsets(g, c)) { - cat = g->ncategories++; - cats[c] = cat; - for (c2 = c+1; c2 <= CHAR_MAX; c2++) - if (cats[c2] == 0 && samesets(g, c, c2)) - cats[c2] = cat; - } + } + cs->types = newtypes; + cs->types[cs->ntypes++] = wct; } /* @@ -1473,8 +1233,8 @@ categorize(struct parse *p, struct re_guts *g) */ static sopno /* start of duplicate */ dupl(struct parse *p, - sopno start, /* from here */ - sopno finish) /* to this less one */ + sopno start, /* from here */ + sopno finish) /* to this less one */ { sopno ret = HERE(); sopno len = finish - start; @@ -1484,8 +1244,8 @@ dupl(struct parse *p, return(ret); enlarge(p, p->ssize + len); /* this many unexpected additions */ assert(p->ssize >= p->slen + len); - memcpy((char *)(p->strip + p->slen), (char *)(p->strip + start), - (size_t)len*sizeof(sop)); + memcpy((char *)(p->strip + p->slen), + (char *)(p->strip + start), (size_t)len*sizeof(sop)); p->slen += len; return(ret); } @@ -1623,22 +1383,23 @@ findmust(struct parse *p, 
struct re_guts *g) sopno newlen; sop s; char *cp; - sopno i; int offset; - int cs, mccs; + char buf[MB_LEN_MAX]; + size_t clen; + mbstate_t mbs; - start = NULL; - newstart = NULL; - /* avoid making error situations worse */ if (p->error != 0) return; - /* Find out if we can handle OANYOF or not */ - mccs = 0; - for (cs = 0; cs < g->ncsets; cs++) - if (g->sets[cs].multis != NULL) - mccs = 1; + /* + * It's not generally safe to do a ``char'' substring search on + * multibyte character strings, but it's safe for at least + * UTF-8 (see RFC 3629). + */ + if (MB_CUR_MAX > 1 && + strcmp(nl_langinfo(CODESET), "UTF-8") != 0) + return; /* find the longest OCHAR sequence in strip */ newlen = 0; @@ -1649,9 +1410,14 @@ findmust(struct parse *p, struct re_guts *g) s = *scan++; switch (OP(s)) { case OCHAR: /* sequence member */ - if (newlen == 0) /* new sequence */ + if (newlen == 0) { /* new sequence */ + memset(&mbs, 0, sizeof(mbs)); newstart = scan - 1; - newlen++; + } + clen = wcrtomb(buf, OPND(s), &mbs); + if (clen == (size_t)-1) + goto toohard; + newlen += clen; break; case OPLUS_: /* things that don't break one */ case OLPAREN: @@ -1659,7 +1425,7 @@ findmust(struct parse *p, struct re_guts *g) break; case OQUEST_: /* things that must be skipped */ case OCH_: - offset = altoffset(scan, offset, mccs); + offset = altoffset(scan, offset); scan--; do { scan += OPND(s); @@ -1671,7 +1437,7 @@ findmust(struct parse *p, struct re_guts *g) return; } } while (OP(s) != O_QUEST && OP(s) != O_CH); - /* fallthrough */ + /* FALLTHROUGH */ case OBOW: /* things that break a sequence */ case OEOW: case OBOL: @@ -1727,12 +1493,8 @@ findmust(struct parse *p, struct re_guts *g) if (offset > -1) offset++; newlen = 0; - /* And, now, if we found out we can't deal with - * it, make offset = -1. 
- */ - if (mccs) - offset = -1; break; + toohard: default: /* Anything here makes it impossible or too hard * to calculate the offset -- so we give up; @@ -1767,11 +1529,13 @@ findmust(struct parse *p, struct re_guts *g) } cp = g->must; scan = start; - for (i = g->mlen; i > 0; i--) { + memset(&mbs, 0, sizeof(mbs)); + while (cp < g->must + g->mlen) { while (OP(s = *scan++) != OCHAR) continue; - assert(cp < g->must + g->mlen); - *cp++ = (char)OPND(s); + clen = wcrtomb(cp, OPND(s), &mbs); + assert(clen != (size_t)-1); + cp += clen; } assert(cp == g->must + g->mlen); *cp++ = '\0'; /* just on general principles */ @@ -1779,13 +1543,13 @@ findmust(struct parse *p, struct re_guts *g) /* - altoffset - choose biggest offset among multiple choices - == static int altoffset(sop *scan, int offset, int mccs); + == static int altoffset(sop *scan, int offset); * * Compute, recursively if necessary, the largest offset among multiple * re paths. */ static int -altoffset(sop *scan, int offset, int mccs) +altoffset(sop *scan, int offset) { int largest; int try; @@ -1807,7 +1571,7 @@ altoffset(sop *scan, int offset, int mccs) break; case OQUEST_: case OCH_: - try = altoffset(scan, try, mccs); + try = altoffset(scan, try); if (try == -1) return -1; scan--; @@ -1824,8 +1588,6 @@ altoffset(sop *scan, int offset, int mccs) scan++; break; case OANYOF: - if (mccs) - return -1; case OCHAR: case OANY: try++; @@ -1889,7 +1651,7 @@ computejumps(struct parse *p, struct re_guts *g) * is the first one that would be matched). */ for (mindex = 0; mindex < g->mlen; mindex++) - g->charjump[g->must[mindex]] = g->mlen - mindex - 1; + g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; } /* diff --git a/lib/libc/regex/regerror.c b/lib/libc/regex/regerror.c index 1cf205e3f2..51ae7c2421 100644 --- a/lib/libc/regex/regerror.c +++ b/lib/libc/regex/regerror.c @@ -14,10 +14,6 @@ * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -35,8 +31,7 @@ * SUCH DAMAGE. * * @(#)regerror.c 8.4 (Berkeley) 3/20/94 - * - * @(#)regerror.c 8.4 (Berkeley) 3/20/94 + * $FreeBSD: src/lib/libc/regex/regerror.c,v 1.11 2007/06/11 03:05:54 delphij Exp $ * $DragonFly: src/lib/libc/regex/regerror.c,v 1.6 2005/11/13 02:17:18 swildner Exp $ */ @@ -78,6 +73,7 @@ static char *regatoi(const regex_t *preg, char *localbuf); = #define REG_EMPTY 14 = #define REG_ASSERT 15 = #define REG_INVARG 16 + = #define REG_ILLSEQ 17 = #define REG_ATOI 255 // convert name to number (!) = #define REG_ITOA 0400 // convert number to name (!) 
*/ @@ -102,6 +98,7 @@ static struct rerr { {REG_EMPTY, "REG_EMPTY", "empty (sub)expression"}, {REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug"}, {REG_INVARG, "REG_INVARG", "invalid argument to regex routine"}, + {REG_ILLSEQ, "REG_ILLSEQ", "illegal byte sequence"}, {0, "", "*** unknown regexp error code ***"} }; @@ -111,7 +108,10 @@ static struct rerr { */ /* ARGSUSED */ size_t -regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) +regerror(int errcode, + const regex_t * __restrict preg, + char * __restrict errbuf, + size_t errbuf_size) { struct rerr *r; size_t len; diff --git a/lib/libc/regex/regex.3 b/lib/libc/regex/regex.3 index 9c7db74505..a11afc82ec 100644 --- a/lib/libc/regex/regex.3 +++ b/lib/libc/regex/regex.3 @@ -13,10 +13,6 @@ .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. .\" 4. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. @@ -34,10 +30,10 @@ .\" SUCH DAMAGE. 
.\" .\" @(#)regex.3 8.4 (Berkeley) 3/20/94 -.\" $FreeBSD: src/lib/libc/regex/regex.3,v 1.4.2.4 2001/12/14 18:33:56 ru Exp $ +.\" $FreeBSD: src/lib/libc/regex/regex.3,v 1.21 2007/01/09 00:28:04 imp Exp $ .\" $DragonFly: src/lib/libc/regex/regex.3,v 1.5 2008/06/02 06:50:08 hasso Exp $ .\" -.Dd March 20, 1994 +.Dd August 17, 2005 .Dt REGEX 3 .Os .Sh NAME @@ -51,16 +47,18 @@ .Sh SYNOPSIS .In regex.h .Ft int -.Fn regcomp "regex_t *preg" "const char *pattern" "int cflags" +.Fo regcomp +.Fa "regex_t * restrict preg" "const char * restrict pattern" "int cflags" +.Fc .Ft int .Fo regexec -.Fa "const regex_t *preg" "const char *string" -.Fa "size_t nmatch" "regmatch_t pmatch[]" "int eflags" +.Fa "const regex_t * restrict preg" "const char * restrict string" +.Fa "size_t nmatch" "regmatch_t pmatch[restrict]" "int eflags" .Fc .Ft size_t .Fo regerror -.Fa "int errcode" "const regex_t *preg" -.Fa "char *errbuf" "size_t errbuf_size" +.Fa "int errcode" "const regex_t * restrict preg" +.Fa "char * restrict errbuf" "size_t errbuf_size" .Fc .Ft void .Fn regfree "regex_t *preg" @@ -71,7 +69,9 @@ regular expressions .Pq Do RE Dc Ns s ; see .Xr re_format 7 . -.Fn Regcomp +The +.Fn regcomp +function compiles an RE written as a string into an internal form, .Fn regexec matches that internal form against a string and reports results, @@ -95,7 +95,9 @@ a type and a number of constants with names starting with .Dq Dv REG_ . .Pp -.Fn Regcomp +The +.Fn regcomp +function compiles the regular expression contained in the .Fa pattern string, @@ -105,7 +107,9 @@ and places the results in the .Ft regex_t structure pointed to by .Fa preg . -.Fa Cflags +The +.Fa cflags +argument is the bitwise OR of zero or more of the following flags: .Bl -tag -width REG_EXTENDED .It Dv REG_EXTENDED @@ -203,7 +207,9 @@ fails, it returns a non-zero error code; see .Sx DIAGNOSTICS . 
.Pp -.Fn Regexec +The +.Fn regexec +function matches the compiled RE pointed to by .Fa preg against the @@ -389,7 +395,9 @@ the value of will not be changed by a successful .Fn regexec . .Pp -.Fn Regerror +The +.Fn regerror +function maps a non-zero .Fa errcode from either @@ -412,17 +420,20 @@ it should have been the result from the most recent .Fn regcomp using that .Ft regex_t . -.No ( Fn Regerror +The +.Fn ( regerror may be able to supply a more detailed message using information from the .Ft regex_t . ) -.Fn Regerror +The +.Fn regerror +function places the NUL-terminated message into the buffer pointed to by .Fa errbuf , limiting the length (including the NUL) to at most .Fa errbuf_size bytes. -If the whole message won't fit, +If the whole message will not fit, as much of it as will fit before the terminating NUL is supplied. In any case, the returned value is the size of buffer needed to hold the whole @@ -473,7 +484,9 @@ and should be used with caution in software intended to be portable to other systems. Be warned also that they are considered experimental and changes are possible. .Pp -.Fn Regfree +The +.Fn regfree +function frees any dynamically-allocated storage associated with the compiled RE pointed to by .Fa preg . @@ -548,7 +561,7 @@ or .Ql |\& cannot appear first or last in a (sub)expression or after another .Ql |\& , -i.e. an operand of +i.e., an operand of .Ql |\& cannot be an empty subexpression. An empty parenthesized subexpression, @@ -581,7 +594,9 @@ include the following: .Pp .Bl -tag -width REG_ECOLLATE -compact .It Dv REG_NOMATCH +The .Fn regexec +function failed to match .It Dv REG_BADPAT invalid regular expression @@ -623,9 +638,11 @@ operand invalid .It Dv REG_EMPTY empty (sub)expression .It Dv REG_ASSERT -can't happen - you found a bug +cannot happen - you found a bug .It Dv REG_INVARG -invalid argument, e.g. 
negative-length string +invalid argument, e.g.\& negative-length string +.It Dv REG_ILLSEQ +illegal byte sequence (bad multibyte character) .El .Sh SEE ALSO .Xr grep 1 , @@ -648,14 +665,20 @@ Please report problems. The back-reference code is subtle and doubts linger about its correctness in complex cases. .Pp -.Fn Regexec +The +.Fn regexec +function performance is poor. This will improve with later releases. -.Fa Nmatch +The +.Fa nmatch +argument exceeding 0 is expensive; .Fa nmatch exceeding 1 is worse. -.Fn Regexec +The +.Fn regexec +function is largely insensitive to RE complexity .Em except that back @@ -664,7 +687,9 @@ RE length does matter; in particular, there is a strong speed bonus for keeping RE length under about 30 characters, with most special characters counting roughly double. .Pp -.Fn Regcomp +The +.Fn regcomp +function implements bounded repetitions by macro expansion, which is costly in time and space if counts are large or bounded repetitions are nested. @@ -687,7 +712,7 @@ are legal REs because is a special character only in the presence of a previous unmatched .Ql (\& . -This can't be fixed until the spec is fixed. +This cannot be fixed until the spec is fixed. .Pp The standard's definition of back references is vague. For example, does @@ -699,3 +724,5 @@ behavior in such cases should not be relied on. .Pp The implementation of word-boundary matching is a bit of a kludge, and bugs may lurk in combinations of word-boundary matching and anchoring. +.Pp +Word-boundary matching does not work properly in multibyte locales. diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h index a1c7444129..1c394959c6 100644 --- a/lib/libc/regex/regex2.h +++ b/lib/libc/regex/regex2.h @@ -14,10 +14,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -35,8 +31,7 @@ * SUCH DAMAGE. * * @(#)regex2.h 8.4 (Berkeley) 3/20/94 - * - * $FreeBSD: src/lib/libc/regex/regex2.h,v 1.3.6.1 2000/07/31 06:30:37 dcs Exp $ + * $FreeBSD: src/lib/libc/regex/regex2.h,v 1.11 2007/01/09 00:28:04 imp Exp $ * $DragonFly: src/lib/libc/regex/regex2.h,v 1.2 2003/06/17 04:26:44 dillon Exp $ */ @@ -89,7 +84,7 @@ typedef long sopno; /* operators meaning operand */ /* (back, fwd are offsets) */ #define OEND (1L< uch [csetsize] */ - uch mask; /* bit within array */ - short hash; /* hash code */ - size_t smultis; - char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */ + wint_t min; + wint_t max; +} crange; +typedef struct { + unsigned char bmp[NC / 8]; + wctype_t *types; + int ntypes; + wint_t *wides; + int nwides; + crange *ranges; + int nranges; + int invert; + int icase; } cset; -/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ -#define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (uch)(c)) -#define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (uch)(c)) -#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask) -#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */ -#define MCsub(p, cs, cp) mcsub(p, cs, cp) -#define MCin(p, cs, cp) mcin(p, cs, cp) -/* stuff for character categories */ -typedef unsigned char cat_t; +static int +CHIN1(cset *cs, wint_t ch) +{ + int i; + + assert(ch >= 0); + if (ch < NC) + return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ + cs->invert); + for (i = 0; i < cs->nwides; i++) + if (ch == cs->wides[i]) + return (!cs->invert); + 
for (i = 0; i < cs->nranges; i++) + if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max) + return (!cs->invert); + for (i = 0; i < cs->ntypes; i++) + if (iswctype(ch, cs->types[i])) + return (!cs->invert); + return (cs->invert); +} + +static __inline int +CHIN(cset *cs, wint_t ch) +{ + + assert(ch >= 0); + if (ch < NC) + return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ + cs->invert); + else if (cs->icase) + return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) || + CHIN1(cs, towupper(ch))); + else + return (CHIN1(cs, ch)); +} /* * main compiled-expression structure @@ -146,10 +166,8 @@ struct re_guts { int magic; # define MAGIC2 ((('R'^0200)<<8)|'E') sop *strip; /* malloced area for strip */ - int csetsize; /* number of bits in a cset vector */ int ncsets; /* number of csets in use */ cset *sets; /* -> cset [ncsets] */ - uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */ int cflags; /* copy of regcomp() cflags argument */ sopno nstates; /* = number of sops */ sopno firststate; /* the initial OEND (normally 0) */ @@ -160,8 +178,6 @@ struct re_guts { # define BAD 04 /* something wrong */ int nbol; /* number of ^ used */ int neol; /* number of $ used */ - int ncategories; /* how many character categories */ - cat_t *categories; /* ->catspace[-CHAR_MIN] */ char *must; /* match must contain this string */ int moffset; /* latest point at which must may be located */ int *charjump; /* Boyer-Moore char jump table */ @@ -170,10 +186,8 @@ struct re_guts { size_t nsub; /* copy of re_nsub */ int backrefs; /* does it use back references? */ sopno nplus; /* how deep does it nest +s? 
*/ - /* catspace must be last */ - cat_t catspace[1]; /* actually [NC] */ }; /* misc utilities */ -#define OUT (CHAR_MAX+1) /* a non-character value */ -#define ISWORD(c) (isalnum((uch)(c)) || (c) == '_') +#define OUT (CHAR_MIN - 1) /* a non-character value */ +#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_') diff --git a/lib/libc/regex/regexec.c b/lib/libc/regex/regexec.c index d7567fc983..f8869163fe 100644 --- a/lib/libc/regex/regexec.c +++ b/lib/libc/regex/regexec.c @@ -14,10 +14,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -35,18 +31,16 @@ * SUCH DAMAGE. * * @(#)regexec.c 8.3 (Berkeley) 3/20/94 - * - * @(#)regexec.c 8.3 (Berkeley) 3/20/94 - * + * $FreeBSD: src/lib/libc/regex/regexec.c,v 1.8 2007/06/11 03:05:54 delphij Exp $ * $DragonFly: src/lib/libc/regex/regexec.c,v 1.4 2005/11/20 09:18:37 swildner Exp $ */ /* * the outer shell of regexec() * - * This file includes engine.c *twice*, after muchos fiddling with the + * This file includes engine.c three times, after muchos fiddling with the * macros that code uses. This lets the same code operate on two different - * representations for state sets. + * representations for state sets and characters. 
*/ #include <sys/types.h> #include <stdio.h> @@ -55,10 +49,47 @@ #include <limits.h> #include <ctype.h> #include <regex.h> +#include <wchar.h> +#include <wctype.h> #include "utils.h" #include "regex2.h" +static int nope __unused = 0; /* for use in asserts; shuts lint up */ + +static __inline size_t +xmbrtowc(wint_t *wi, const char *s, size_t n, mbstate_t *mbs, wint_t dummy) +{ + size_t nr; + wchar_t wc; + + nr = mbrtowc(&wc, s, n, mbs); + if (wi != NULL) + *wi = wc; + if (nr == 0) + return (1); + else if (nr == (size_t)-1 || nr == (size_t)-2) { + memset(mbs, 0, sizeof(*mbs)); + if (wi != NULL) + *wi = dummy; + return (1); + } else + return (nr); +} + +static __inline size_t +xmbrtowc_dummy(wint_t *wi, + const char *s, + size_t n __unused, + mbstate_t *mbs __unused, + wint_t dummy __unused) +{ + + if (wi != NULL) + *wi = (unsigned char)*s; + return (1); +} + /* macros for manipulating states, small version */ #define states long #define states1 states /* for later use in regexec() decision */ @@ -81,6 +112,9 @@ #define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n)) #define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n)) #define ISSETBACK(v, n) (((v) & ((unsigned long)here >> (n))) != 0) +/* no multibyte support */ +#define XMBRTOWC xmbrtowc_dummy +#define ZAPSTATE(mbs) ((mbs)) /* function names */ #define SNAMES /* engine.c looks after details */ @@ -106,6 +140,8 @@ #undef BACK #undef ISSETBACK #undef SNAMES +#undef XMBRTOWC +#undef ZAPSTATE /* macros for manipulating states, large version */ #define states char * @@ -130,11 +166,24 @@ #define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here]) #define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here]) #define ISSETBACK(v, n) ((v)[here - (n)]) +/* no multibyte support */ +#define XMBRTOWC xmbrtowc_dummy +#define ZAPSTATE(mbs) ((mbs)) /* function names */ #define LNAMES /* flag */ #include "engine.c" +/* multibyte character & large states version */ +#undef LNAMES +#undef XMBRTOWC +#undef ZAPSTATE +#define XMBRTOWC xmbrtowc +#define ZAPSTATE(mbs)
memset((mbs), 0, sizeof(*(mbs))) +#define MNAMES + +#include "engine.c" + /* - regexec - interface for matching = extern int regexec(const regex_t *, const char *, size_t, \ @@ -151,8 +200,11 @@ * have been prototyped. */ int /* 0 success, REG_NOMATCH failure */ -regexec(const regex_t *preg, const char *string, size_t nmatch, - regmatch_t pmatch[], int eflags) +regexec(const regex_t * __restrict preg, + const char * __restrict string, + size_t nmatch, + regmatch_t pmatch[__restrict], + int eflags) { struct re_guts *g = preg->re_g; #ifdef REDEBUG @@ -168,7 +220,9 @@ regexec(const regex_t *preg, const char *string, size_t nmatch, return(REG_BADPAT); eflags = GOODFLAGS(eflags); - if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE)) + if (MB_CUR_MAX > 1) + return(mmatcher(g, (char *)string, nmatch, pmatch, eflags)); + else if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE)) return(smatcher(g, (char *)string, nmatch, pmatch, eflags)); else return(lmatcher(g, (char *)string, nmatch, pmatch, eflags)); diff --git a/lib/libc/regex/regfree.c b/lib/libc/regex/regfree.c index e03264610c..d39bce3a6b 100644 --- a/lib/libc/regex/regfree.c +++ b/lib/libc/regex/regfree.c @@ -14,10 +14,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -35,11 +31,8 @@ * SUCH DAMAGE.
* * @(#)regfree.c 8.3 (Berkeley) 3/20/94 - * - * $FreeBSD: src/lib/libc/regex/regfree.c,v 1.1.1.1.14.1 2000/07/31 06:30:37 dcs Exp $ + * $FreeBSD: src/lib/libc/regex/regfree.c,v 1.8 2007/06/11 03:05:54 delphij Exp $ * $DragonFly: src/lib/libc/regex/regfree.c,v 1.3 2004/02/06 22:36:50 joerg Exp $ - * - * @(#)regfree.c 8.3 (Berkeley) 3/20/94 */ #include <sys/types.h> @@ -47,6 +40,8 @@ #include <stdlib.h> #include <limits.h> #include <regex.h> +#include <wchar.h> +#include <wctype.h> #include "utils.h" #include "regex2.h" @@ -59,6 +54,7 @@ void regfree(regex_t *preg) { struct re_guts *g; + int i; if (preg->re_magic != MAGIC1) /* oops */ return; /* nice to complain, but hard */ @@ -71,10 +67,14 @@ regfree(regex_t *preg) if (g->strip != NULL) free((char *)g->strip); - if (g->sets != NULL) + if (g->sets != NULL) { + for (i = 0; i < g->ncsets; i++) { + free(g->sets[i].ranges); + free(g->sets[i].wides); + free(g->sets[i].types); + } free((char *)g->sets); - if (g->setbits != NULL) - free((char *)g->setbits); + } if (g->must != NULL) free(g->must); if (g->charjump != NULL) diff --git a/lib/libc/regex/utils.h b/lib/libc/regex/utils.h index d804d8db11..2a2ed9694d 100644 --- a/lib/libc/regex/utils.h +++ b/lib/libc/regex/utils.h @@ -14,10 +14,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -35,6 +31,7 @@ * SUCH DAMAGE.
* * @(#)utils.h 8.3 (Berkeley) 3/20/94 + * $FreeBSD: src/lib/libc/regex/utils.h,v 1.3 2007/01/09 00:28:04 imp Exp $ */ /* utility definitions */