2 * re.c - compile regular expressions.
6 * Copyright (C) 1991-2000 the Free Software Foundation, Inc.
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
28 static reg_syntax_t syn;
30 /* make_regexp --- generate compiled regular expressions */
33 make_regexp(s, len, ignorecase, dfa)
47 /* Handle escaped characters first. */
50 * Build a copy of the string (in dest) with the
51 * escaped characters translated, and generate the regex
54 emalloc(dest, char *, len + 2, "make_regexp");
77 c2 = parse_escape(&src);
81 * Unix awk treats octal (and hex?) chars
82 * literally in re's, so escape regexp
85 if (do_traditional && ! do_posix && (isdigit(c) || c == 'x')
86 && strchr("()|*+?.^$\\[]", c2) != NULL)
91 case '9': /* a\9b not valid */
95 case 'y': /* normally \b */
97 if (! do_traditional) {
103 /* else, fall through */
111 *dest++ = *src++; /* not '\\' */
114 *dest = '\0' ; /* Only necessary if we print dest ? */
115 emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
116 memset((char *) rp, 0, sizeof(*rp));
117 rp->pat.allocated = 0; /* regex will allocate the buffer */
118 emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
121 rp->pat.translate = casetable;
123 rp->pat.translate = NULL;
125 if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
126 fatal("%s: /%s/", rerr, temp);
128 /* gack. this must be done *after* re_compile_pattern */
129 rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
130 if (dfa && ! ignorecase) {
131 dfacomp(temp, len, &(rp->dfareg), TRUE);
140 /* research --- do a regexp search. use dfa if possible */
143 research(rp, str, start, len, need_start)
154 * Always do dfa search if we can; if it fails, then even if
155 * need_start is true, we won't bother with the regex search.
162 * dfa likes to stick a '\n' right after the matched
163 * text. So we just save and restore the character.
165 save = str[start+len];
166 ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
167 &count, &try_backref);
168 str[start+len] = save;
171 if (need_start || rp->dfa == FALSE || try_backref) {
172 int result = re_search(&(rp->pat), str, start+len,
173 start, len, &(rp->regs));
174 /* recover any space from C-based alloca */
185 /* refree --- free up the dynamic memory used by a compiled regexp */
191 free(rp->pat.buffer);
192 free(rp->pat.fastmap);
194 free(rp->regs.start);
198 dfafree(&(rp->dfareg));
202 /* dfaerror --- print an error message for the dfa routines */
211 /* re_update --- recompile a dynamic regexp */
219 if ((t->re_flags & CASE) == IGNORECASE) {
220 if ((t->re_flags & CONST) != 0)
222 t1 = force_string(tree_eval(t->re_exp));
223 if (t->re_text != NULL) {
224 if (cmp_nodes(t->re_text, t1) == 0) {
230 t->re_text = dupnode(t1);
233 if (t->re_reg != NULL)
239 if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
240 t1 = force_string(tree_eval(t->re_exp));
241 t->re_text = dupnode(t1);
244 t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
245 IGNORECASE, t->re_cnt);
246 t->re_flags &= ~CASE;
247 t->re_flags |= IGNORECASE;
251 /* resetup --- choose what kind of regexps we match */
257 syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
258 else if (do_traditional)
259 syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
261 syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
264 * Interval expressions are off by default, since turning them on
265 * is likely to break too many old programs.
270 (void) re_set_syntax(syn);
271 dfasyntax(syn, FALSE);
274 /* avoid_dfa --- FIXME: temporary kludge function until we have a new dfa.c */
277 avoid_dfa(re, str, len)
287 if ((re->re_flags & CONST) != 0) {
288 restr = re->re_exp->stptr;
289 relen = re->re_exp->stlen;
291 restr = re->re_text->stptr;
292 relen = re->re_text->stlen;
295 for (anchor = FALSE, i = 0; i < relen; i++) {
296 if (restr[i] == '^' || restr[i] == '$') {
304 for (end = str + len; str < end; str++)