Add per-device polling support.
[dragonfly.git] / contrib / awk / re.c
1 /*
2  * re.c - compile regular expressions.
3  */
4
5 /* 
6  * Copyright (C) 1991-2000 the Free Software Foundation, Inc.
7  * 
8  * This file is part of GAWK, the GNU implementation of the
9  * AWK Programming Language.
10  * 
11  * GAWK is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  * 
16  * GAWK is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  * 
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
24  */
25
26 #include "awk.h"
27
/* regex syntax bits chosen in resetup() and passed to re_set_syntax() and dfasyntax() */
static reg_syntax_t syn;
29
30 /* make_regexp --- generate compiled regular expressions */
31
32 Regexp *
33 make_regexp(s, len, ignorecase, dfa)
34 char *s;
35 size_t len;
36 int ignorecase;
37 int dfa;
38 {
39         Regexp *rp;
40         const char *rerr;
41         char *src = s;
42         char *temp;
43         char *end = s + len;
44         register char *dest;
45         register int c, c2;
46
47         /* Handle escaped characters first. */
48
49         /*
50          * Build a copy of the string (in dest) with the
51          * escaped characters translated, and generate the regex
52          * from that.  
53          */
54         emalloc(dest, char *, len + 2, "make_regexp");
55         temp = dest;
56
57         while (src < end) {
58                 if (*src == '\\') {
59                         c = *++src;
60                         switch (c) {
61                         case 'a':
62                         case 'b':
63                         case 'f':
64                         case 'n':
65                         case 'r':
66                         case 't':
67                         case 'v':
68                         case 'x':
69                         case '0':
70                         case '1':
71                         case '2':
72                         case '3':
73                         case '4':
74                         case '5':
75                         case '6':
76                         case '7':
77                                 c2 = parse_escape(&src);
78                                 if (c2 < 0)
79                                         cant_happen();
80                                 /*
81                                  * Unix awk treats octal (and hex?) chars
82                                  * literally in re's, so escape regexp
83                                  * metacharacters.
84                                  */
85                                 if (do_traditional && ! do_posix && (isdigit(c) || c == 'x')
86                                     && strchr("()|*+?.^$\\[]", c2) != NULL)
87                                         *dest++ = '\\';
88                                 *dest++ = (char) c2;
89                                 break;
90                         case '8':
91                         case '9':       /* a\9b not valid */
92                                 *dest++ = c;
93                                 src++;
94                                 break;
95                         case 'y':       /* normally \b */
96                                 /* gnu regex op */
97                                 if (! do_traditional) {
98                                         *dest++ = '\\';
99                                         *dest++ = 'b';
100                                         src++;
101                                         break;
102                                 }
103                                 /* else, fall through */
104                         default:
105                                 *dest++ = '\\';
106                                 *dest++ = (char) c;
107                                 src++;
108                                 break;
109                         } /* switch */
110                 } else
111                         *dest++ = *src++;       /* not '\\' */
112         } /* for */
113
114         *dest = '\0' ;  /* Only necessary if we print dest ? */
115         emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
116         memset((char *) rp, 0, sizeof(*rp));
117         rp->pat.allocated = 0;  /* regex will allocate the buffer */
118         emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
119
120         if (ignorecase)
121                 rp->pat.translate = casetable;
122         else
123                 rp->pat.translate = NULL;
124         len = dest - temp;
125         if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
126                 fatal("%s: /%s/", rerr, temp);
127
128         /* gack. this must be done *after* re_compile_pattern */
129         rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
130         if (dfa && ! ignorecase) {
131                 dfacomp(temp, len, &(rp->dfareg), TRUE);
132                 rp->dfa = TRUE;
133         } else
134                 rp->dfa = FALSE;
135
136         free(temp);
137         return rp;
138 }
139
140 /* research --- do a regexp search. use dfa if possible */
141
142 int
143 research(rp, str, start, len, need_start)
144 Regexp *rp;
145 register char *str;
146 int start;
147 register size_t len;
148 int need_start;
149 {
150         char *ret = str;
151         int try_backref;
152
153         /*
154          * Always do dfa search if can; if it fails, then even if
155          * need_start is true, we won't bother with the regex search.
156          */
157         if (rp->dfa) {
158                 char save;
159                 int count = 0;
160
161                 /*
162                  * dfa likes to stick a '\n' right after the matched
163                  * text.  So we just save and restore the character.
164                  */
165                 save = str[start+len];
166                 ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
167                                         &count, &try_backref);
168                 str[start+len] = save;
169         }
170         if (ret) {
171                 if (need_start || rp->dfa == FALSE || try_backref) {
172                         int result = re_search(&(rp->pat), str, start+len,
173                                         start, len, &(rp->regs));
174                         /* recover any space from C based alloca */
175 #ifdef C_ALLOCA
176                         (void) alloca(0);
177 #endif
178                         return result;
179                 } else
180                         return 1;
181         } else
182                 return -1;
183 }
184
185 /* refree --- free up the dynamic memory used by a compiled regexp */
186
187 void
188 refree(rp)
189 Regexp *rp;
190 {
191         free(rp->pat.buffer);
192         free(rp->pat.fastmap);
193         if (rp->regs.start)
194                 free(rp->regs.start);
195         if (rp->regs.end)
196                 free(rp->regs.end);
197         if (rp->dfa)
198                 dfafree(&(rp->dfareg));
199         free(rp);
200 }
201
/*
 * dfaerror --- print an error message for the dfa routines
 *
 * Hands s to fatal() through a "%s" format, so the message text is
 * never itself interpreted as a format string.
 */

void
dfaerror(s)
const char *s;
{
        fatal("%s", s);
}
210
211 /* re_update --- recompile a dynamic regexp */
212
213 Regexp *
214 re_update(t)
215 NODE *t;
216 {
217         NODE *t1;
218
219         if ((t->re_flags & CASE) == IGNORECASE) {
220                 if ((t->re_flags & CONST) != 0)
221                         return t->re_reg;
222                 t1 = force_string(tree_eval(t->re_exp));
223                 if (t->re_text != NULL) {
224                         if (cmp_nodes(t->re_text, t1) == 0) {
225                                 free_temp(t1);
226                                 return t->re_reg;
227                         }
228                         unref(t->re_text);
229                 }
230                 t->re_text = dupnode(t1);
231                 free_temp(t1);
232         }
233         if (t->re_reg != NULL)
234                 refree(t->re_reg);
235         if (t->re_cnt > 0)
236                 t->re_cnt++;
237         if (t->re_cnt > 10)
238                 t->re_cnt = 0;
239         if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
240                 t1 = force_string(tree_eval(t->re_exp));
241                 t->re_text = dupnode(t1);
242                 free_temp(t1);
243         }
244         t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
245                                 IGNORECASE, t->re_cnt);
246         t->re_flags &= ~CASE;
247         t->re_flags |= IGNORECASE;
248         return t->re_reg;
249 }
250
251 /* resetup --- choose what kind of regexps we match */
252
253 void
254 resetup()
255 {
256         if (do_posix)
257                 syn = RE_SYNTAX_POSIX_AWK;      /* strict POSIX re's */
258         else if (do_traditional)
259                 syn = RE_SYNTAX_AWK;            /* traditional Unix awk re's */
260         else
261                 syn = RE_SYNTAX_GNU_AWK;        /* POSIX re's + GNU ops */
262
263         /*
264          * Interval expressions are off by default, since it's likely to
265          * break too many old programs to have them on.
266          */
267         if (do_intervals)
268                 syn |= RE_INTERVALS;
269
270         (void) re_set_syntax(syn);
271         dfasyntax(syn, FALSE);
272 }
273
274 /* avoid_dfa --- FIXME: temporary kludge function until we have a new dfa.c */
275
276 int
277 avoid_dfa(re, str, len)
278 NODE *re;
279 char *str;
280 size_t len;
281 {
282         char *restr;
283         int relen;
284         int anchor, i;
285         char *end;
286
287         if ((re->re_flags & CONST) != 0) {
288                 restr = re->re_exp->stptr;
289                 relen = re->re_exp->stlen;
290         } else {
291                 restr = re->re_text->stptr;
292                 relen = re->re_text->stlen;
293         }
294
295         for (anchor = FALSE, i = 0; i < relen; i++) {
296                 if (restr[i] == '^' || restr[i] == '$') {
297                         anchor = TRUE;
298                         break;
299                 }
300         }
301         if (! anchor)
302                 return FALSE;
303
304         for (end = str + len; str < end; str++)
305                 if (*str == '\n')
306                         return TRUE;
307
308         return FALSE;
309 }