Import of awk version 20100523
[dragonfly.git] / contrib / awk20050424 / lex.c
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31
32 extern YYSTYPE  yylval;
33 extern int      infunc;
34
35 int     lineno  = 1;
36 int     bracecnt = 0;
37 int     brackcnt  = 0;
38 int     parencnt = 0;
39
/* One row of the reserved-word table that word() looks up via binsearch(). */
typedef struct Keyword {
	const char	*word;	/* spelling compared with strcmp() */
	int		sub;	/* stored in yylval.i when the word matches */
	int		type;	/* token code returned when the word matches */
} Keyword;
45
46 Keyword keywords[] ={   /* keep sorted: binary searched */
47         { "BEGIN",      XBEGIN,         XBEGIN },
48         { "END",        XEND,           XEND },
49         { "NF",         VARNF,          VARNF },
50         { "atan2",      FATAN,          BLTIN },
51         { "break",      BREAK,          BREAK },
52         { "close",      CLOSE,          CLOSE },
53         { "continue",   CONTINUE,       CONTINUE },
54         { "cos",        FCOS,           BLTIN },
55         { "delete",     DELETE,         DELETE },
56         { "do",         DO,             DO },
57         { "else",       ELSE,           ELSE },
58         { "exit",       EXIT,           EXIT },
59         { "exp",        FEXP,           BLTIN },
60         { "fflush",     FFLUSH,         BLTIN },
61         { "for",        FOR,            FOR },
62         { "func",       FUNC,           FUNC },
63         { "function",   FUNC,           FUNC },
64         { "getline",    GETLINE,        GETLINE },
65         { "gsub",       GSUB,           GSUB },
66         { "if",         IF,             IF },
67         { "in",         IN,             IN },
68         { "index",      INDEX,          INDEX },
69         { "int",        FINT,           BLTIN },
70         { "length",     FLENGTH,        BLTIN },
71         { "log",        FLOG,           BLTIN },
72         { "match",      MATCHFCN,       MATCHFCN },
73         { "next",       NEXT,           NEXT },
74         { "nextfile",   NEXTFILE,       NEXTFILE },
75         { "print",      PRINT,          PRINT },
76         { "printf",     PRINTF,         PRINTF },
77         { "rand",       FRAND,          BLTIN },
78         { "return",     RETURN,         RETURN },
79         { "sin",        FSIN,           BLTIN },
80         { "split",      SPLIT,          SPLIT },
81         { "sprintf",    SPRINTF,        SPRINTF },
82         { "sqrt",       FSQRT,          BLTIN },
83         { "srand",      FSRAND,         BLTIN },
84         { "sub",        SUB,            SUB },
85         { "substr",     SUBSTR,         SUBSTR },
86         { "system",     FSYSTEM,        BLTIN },
87         { "tolower",    FTOLOWER,       BLTIN },
88         { "toupper",    FTOUPPER,       BLTIN },
89         { "while",      WHILE,          WHILE },
90 };
91
/*
 * RET(x): return token code x from the enclosing function.
 *
 * DEBUG is defined unconditionally here, so the tracing form is the one in
 * use: when the dbg flag is nonzero the token's name is printed before the
 * return.  The argument is expanded twice in that form, so it must not have
 * side effects.
 *
 * The tracing form is wrapped in do { ... } while (0) so that "RET(x);" is
 * exactly one statement and cannot detach a following "else".
 */
#define	DEBUG
#ifdef	DEBUG
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
#else
#define	RET(x)	return (x)
#endif
98
/* peek: return the next input character without consuming it */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* push it straight back; the following input() returns it again */
	return next;
}
105
/*
 * gettok: read the next raw token from the input into the caller's buffer.
 *
 * pbuf, psz	address of the token buffer and of its size in bytes.  The
 *		buffer must hold at least two bytes on entry.  On the name and
 *		number paths both are written back, because adjbuf() may have
 *		replaced or enlarged the buffer.
 *
 * Returns 0 at end of input; 'a' when the token is a name (a letter or '_'
 * followed by letters, digits and '_'); '0' when it is a number; otherwise
 * the single character read.  For every nonzero return the token text is
 * left NUL-terminated in the buffer.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;	/* punctuation: the character is its own token */

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Reserve room for this character AND the terminator
			 * before storing, so the final "*bp = 0" is in bounds
			 * even when input ends right after the buffer fills.
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				unput(c);	/* first character past the name */
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reservation rule as for names above */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* only rem is wanted: where the number ends */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE cutting the buffer down to
			 * one character; truncating first would leave nothing
			 * to push back and the tail would be lost.
			 */
			unputstr(buf+1);	/* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
168
/* Scanners defined later in this file and called from yylex(). */
int	word(char *);
int	string(void);
int	regexpr(void);

/* One-shot flags that yylex() tests and clears at the top of each call. */
int	sc = 0;		/* 1 => return a } right now (set by the '}' case) */
int	reg = 0;	/* 1 => return a REGEXPR now (set by startreg()) */
174
175 int yylex(void)
176 {
177         int c;
178         static char *buf = 0;
179         static int bufsize = 500;
180
181         if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
182                 FATAL( "out of space in yylex" );
183         if (sc) {
184                 sc = 0;
185                 RET('}');
186         }
187         if (reg) {
188                 reg = 0;
189                 return regexpr();
190         }
191 /* printf("top\n"); */
192         for (;;) {
193                 c = gettok(&buf, &bufsize);
194 /* printf("gettok [%s]\n", buf); */
195                 if (c == 0)
196                         return 0;
197                 if (isalpha(c) || c == '_')
198                         return word(buf);
199                 if (isdigit(c)) {
200                         yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
201                         /* should this also have STR set? */
202                         RET(NUMBER);
203                 }
204         
205                 yylval.i = c;
206                 switch (c) {
207                 case '\n':      /* {EOL} */
208                         RET(NL);
209                 case '\r':      /* assume \n is coming */
210                 case ' ':       /* {WS}+ */
211                 case '\t':
212                         break;
213                 case '#':       /* #.* strip comments */
214                         while ((c = input()) != '\n' && c != 0)
215                                 ;
216                         unput(c);
217                         break;
218                 case ';':
219                         RET(';');
220                 case '\\':
221                         if (peek() == '\n') {
222                                 input();
223                         } else if (peek() == '\r') {
224                                 input(); input();       /* \n */
225                                 lineno++;
226                         } else {
227                                 RET(c);
228                         }
229                         break;
230                 case '&':
231                         if (peek() == '&') {
232                                 input(); RET(AND);
233                         } else 
234                                 RET('&');
235                 case '|':
236                         if (peek() == '|') {
237                                 input(); RET(BOR);
238                         } else
239                                 RET('|');
240                 case '!':
241                         if (peek() == '=') {
242                                 input(); yylval.i = NE; RET(NE);
243                         } else if (peek() == '~') {
244                                 input(); yylval.i = NOTMATCH; RET(MATCHOP);
245                         } else
246                                 RET(NOT);
247                 case '~':
248                         yylval.i = MATCH;
249                         RET(MATCHOP);
250                 case '<':
251                         if (peek() == '=') {
252                                 input(); yylval.i = LE; RET(LE);
253                         } else {
254                                 yylval.i = LT; RET(LT);
255                         }
256                 case '=':
257                         if (peek() == '=') {
258                                 input(); yylval.i = EQ; RET(EQ);
259                         } else {
260                                 yylval.i = ASSIGN; RET(ASGNOP);
261                         }
262                 case '>':
263                         if (peek() == '=') {
264                                 input(); yylval.i = GE; RET(GE);
265                         } else if (peek() == '>') {
266                                 input(); yylval.i = APPEND; RET(APPEND);
267                         } else {
268                                 yylval.i = GT; RET(GT);
269                         }
270                 case '+':
271                         if (peek() == '+') {
272                                 input(); yylval.i = INCR; RET(INCR);
273                         } else if (peek() == '=') {
274                                 input(); yylval.i = ADDEQ; RET(ASGNOP);
275                         } else
276                                 RET('+');
277                 case '-':
278                         if (peek() == '-') {
279                                 input(); yylval.i = DECR; RET(DECR);
280                         } else if (peek() == '=') {
281                                 input(); yylval.i = SUBEQ; RET(ASGNOP);
282                         } else
283                                 RET('-');
284                 case '*':
285                         if (peek() == '=') {    /* *= */
286                                 input(); yylval.i = MULTEQ; RET(ASGNOP);
287                         } else if (peek() == '*') {     /* ** or **= */
288                                 input();        /* eat 2nd * */
289                                 if (peek() == '=') {
290                                         input(); yylval.i = POWEQ; RET(ASGNOP);
291                                 } else {
292                                         RET(POWER);
293                                 }
294                         } else
295                                 RET('*');
296                 case '/':
297                         RET('/');
298                 case '%':
299                         if (peek() == '=') {
300                                 input(); yylval.i = MODEQ; RET(ASGNOP);
301                         } else
302                                 RET('%');
303                 case '^':
304                         if (peek() == '=') {
305                                 input(); yylval.i = POWEQ; RET(ASGNOP);
306                         } else
307                                 RET(POWER);
308
309                 case '$':
310                         /* BUG: awkward, if not wrong */
311                         c = gettok(&buf, &bufsize);
312                         if (isalpha(c)) {
313                                 if (strcmp(buf, "NF") == 0) {   /* very special */
314                                         unputstr("(NF)");
315                                         RET(INDIRECT);
316                                 }
317                                 c = peek();
318                                 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
319                                         unputstr(buf);
320                                         RET(INDIRECT);
321                                 }
322                                 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
323                                 RET(IVAR);
324                         } else if (c == 0) {    /*  */
325                                 SYNTAX( "unexpected end of input after $" );
326                                 RET(';');
327                         } else {
328                                 unputstr(buf);
329                                 RET(INDIRECT);
330                         }
331         
332                 case '}':
333                         if (--bracecnt < 0)
334                                 SYNTAX( "extra }" );
335                         sc = 1;
336                         RET(';');
337                 case ']':
338                         if (--brackcnt < 0)
339                                 SYNTAX( "extra ]" );
340                         RET(']');
341                 case ')':
342                         if (--parencnt < 0)
343                                 SYNTAX( "extra )" );
344                         RET(')');
345                 case '{':
346                         bracecnt++;
347                         RET('{');
348                 case '[':
349                         brackcnt++;
350                         RET('[');
351                 case '(':
352                         parencnt++;
353                         RET('(');
354         
355                 case '"':
356                         return string();        /* BUG: should be like tran.c ? */
357         
358                 default:
359                         RET(c);
360                 }
361         }
362 }
363
364 int string(void)
365 {
366         int c, n;
367         char *s, *bp;
368         static char *buf = 0;
369         static int bufsz = 500;
370
371         if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
372                 FATAL("out of space for strings");
373         for (bp = buf; (c = input()) != '"'; ) {
374                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
375                         FATAL("out of space for string %.10s...", buf);
376                 switch (c) {
377                 case '\n':
378                 case '\r':
379                 case 0:
380                         SYNTAX( "non-terminated string %.10s...", buf );
381                         lineno++;
382                         if (c == 0)     /* hopeless */
383                                 FATAL( "giving up" );
384                         break;
385                 case '\\':
386                         c = input();
387                         switch (c) {
388                         case '"': *bp++ = '"'; break;
389                         case 'n': *bp++ = '\n'; break;  
390                         case 't': *bp++ = '\t'; break;
391                         case 'f': *bp++ = '\f'; break;
392                         case 'r': *bp++ = '\r'; break;
393                         case 'b': *bp++ = '\b'; break;
394                         case 'v': *bp++ = '\v'; break;
395                         case 'a': *bp++ = '\007'; break;
396                         case '\\': *bp++ = '\\'; break;
397
398                         case '0': case '1': case '2': /* octal: \d \dd \ddd */
399                         case '3': case '4': case '5': case '6': case '7':
400                                 n = c - '0';
401                                 if ((c = peek()) >= '0' && c < '8') {
402                                         n = 8 * n + input() - '0';
403                                         if ((c = peek()) >= '0' && c < '8')
404                                                 n = 8 * n + input() - '0';
405                                 }
406                                 *bp++ = n;
407                                 break;
408
409                         case 'x':       /* hex  \x0-9a-fA-F + */
410                             {   char xbuf[100], *px;
411                                 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
412                                         if (isdigit(c)
413                                          || (c >= 'a' && c <= 'f')
414                                          || (c >= 'A' && c <= 'F'))
415                                                 *px++ = c;
416                                         else
417                                                 break;
418                                 }
419                                 *px = 0;
420                                 unput(c);
421                                 sscanf(xbuf, "%x", &n);
422                                 *bp++ = n;
423                                 break;
424                             }
425
426                         default: 
427                                 *bp++ = c;
428                                 break;
429                         }
430                         break;
431                 default:
432                         *bp++ = c;
433                         break;
434                 }
435         }
436         *bp = 0; 
437         s = tostring(buf);
438         *bp++ = ' '; *bp++ = 0;
439         yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
440         RET(STRING);
441 }
442
443
444 int binsearch(char *w, Keyword *kp, int n)
445 {
446         int cond, low, mid, high;
447
448         low = 0;
449         high = n - 1;
450         while (low <= high) {
451                 mid = (low + high) / 2;
452                 if ((cond = strcmp(w, kp[mid].word)) < 0)
453                         high = mid - 1;
454                 else if (cond > 0)
455                         low = mid + 1;
456                 else
457                         return mid;
458         }
459         return -1;
460 }
461
462 int word(char *w) 
463 {
464         Keyword *kp;
465         int c, n;
466
467         n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
468         kp = keywords + n;
469         if (n != -1) {  /* found in table */
470                 yylval.i = kp->sub;
471                 switch (kp->type) {     /* special handling */
472                 case FSYSTEM:
473                         if (safe)
474                                 SYNTAX( "system is unsafe" );
475                         RET(kp->type);
476                 case FUNC:
477                         if (infunc)
478                                 SYNTAX( "illegal nested function" );
479                         RET(kp->type);
480                 case RETURN:
481                         if (!infunc)
482                                 SYNTAX( "return not in function" );
483                         RET(kp->type);
484                 case VARNF:
485                         yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
486                         RET(VARNF);
487                 default:
488                         RET(kp->type);
489                 }
490         }
491         c = peek();     /* look for '(' */
492         if (c != '(' && infunc && (n=isarg(w)) >= 0) {
493                 yylval.i = n;
494                 RET(ARG);
495         } else {
496                 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
497                 if (c == '(') {
498                         RET(CALL);
499                 } else {
500                         RET(VAR);
501                 }
502         }
503 }
504
505 void startreg(void)     /* next call to yylex will return a regular expression */
506 {
507         reg = 1;
508 }
509
510 int regexpr(void)
511 {
512         int c;
513         static char *buf = 0;
514         static int bufsz = 500;
515         char *bp;
516
517         if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
518                 FATAL("out of space for rex expr");
519         bp = buf;
520         for ( ; (c = input()) != '/' && c != 0; ) {
521                 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
522                         FATAL("out of space for reg expr %.10s...", buf);
523                 if (c == '\n') {
524                         SYNTAX( "newline in regular expression %.10s...", buf ); 
525                         unput('\n');
526                         break;
527                 } else if (c == '\\') {
528                         *bp++ = '\\'; 
529                         *bp++ = input();
530                 } else {
531                         *bp++ = c;
532                 }
533         }
534         *bp = 0;
535         if (c == 0)
536                 SYNTAX("non-terminated regular expression %.10s...", buf);
537         yylval.s = tostring(buf);
538         unput('/');
539         RET(REGEXPR);
540 }
541
542 /* low-level lexical stuff, sort of inherited from lex */
543
char	ebuf[300];		/* circular record of characters returned by input() */
char	*ep = ebuf;		/* next slot in ebuf; advanced by input(), backed up by unput() */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the last pushed-back character; input() pops from here */
FILE	*yyin = 0;		/* not referenced in the visible part of this file */
549
550 int input(void) /* get next lexical input character */
551 {
552         int c;
553         extern char *lexprog;
554
555         if (yysptr > yysbuf)
556                 c = (uschar)*--yysptr;
557         else if (lexprog != NULL) {     /* awk '...' */
558                 if ((c = (uschar)*lexprog) != 0)
559                         lexprog++;
560         } else                          /* awk -f ... */
561                 c = pgetc();
562         if (c == '\n')
563                 lineno++;
564         else if (c == EOF)
565                 c = 0;
566         if (ep >= ebuf + sizeof ebuf)
567                 ep = ebuf;
568         return *ep++ = c;
569 }
570
571 void unput(int c)       /* put lexical character back on input */
572 {
573         if (c == '\n')
574                 lineno--;
575         if (yysptr >= yysbuf + sizeof(yysbuf))
576                 FATAL("pushed back too much: %.20s...", yysbuf);
577         *yysptr++ = c;
578         if (--ep < ebuf)
579                 ep = ebuf + sizeof(ebuf) - 1;
580 }
581
/*
 * unputstr: push the whole string s back onto the input.  Characters are
 * pushed last-to-first so that input() hands them back in their original
 * order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
587                 unput(s[i]);
588 }