vendor/awk: upgrade from 20121220 to 20200612
[dragonfly.git] / contrib / awk / lex.c
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31
32 extern YYSTYPE  yylval;
33 extern bool     infunc;
34
35 int     lineno  = 1;
36 int     bracecnt = 0;
37 int     brackcnt  = 0;
38 int     parencnt = 0;
39
/*
 * One reserved word or built-in name recognised by word().
 * word() stores `sub` in yylval.i and returns `type` as the token.
 */
struct Keyword {
	const char *word;	/* spelling as it appears in the program text */
	int sub;		/* value placed in yylval.i */
	int type;		/* token code returned to the parser */
};
typedef struct Keyword Keyword;
45
46 const Keyword keywords[] = {    /* keep sorted: binary searched */
47         { "BEGIN",      XBEGIN,         XBEGIN },
48         { "END",        XEND,           XEND },
49         { "NF",         VARNF,          VARNF },
50         { "atan2",      FATAN,          BLTIN },
51         { "break",      BREAK,          BREAK },
52         { "close",      CLOSE,          CLOSE },
53         { "continue",   CONTINUE,       CONTINUE },
54         { "cos",        FCOS,           BLTIN },
55         { "delete",     DELETE,         DELETE },
56         { "do",         DO,             DO },
57         { "else",       ELSE,           ELSE },
58         { "exit",       EXIT,           EXIT },
59         { "exp",        FEXP,           BLTIN },
60         { "fflush",     FFLUSH,         BLTIN },
61         { "for",        FOR,            FOR },
62         { "func",       FUNC,           FUNC },
63         { "function",   FUNC,           FUNC },
64         { "getline",    GETLINE,        GETLINE },
65         { "gsub",       GSUB,           GSUB },
66         { "if",         IF,             IF },
67         { "in",         IN,             IN },
68         { "index",      INDEX,          INDEX },
69         { "int",        FINT,           BLTIN },
70         { "length",     FLENGTH,        BLTIN },
71         { "log",        FLOG,           BLTIN },
72         { "match",      MATCHFCN,       MATCHFCN },
73         { "next",       NEXT,           NEXT },
74         { "nextfile",   NEXTFILE,       NEXTFILE },
75         { "print",      PRINT,          PRINT },
76         { "printf",     PRINTF,         PRINTF },
77         { "rand",       FRAND,          BLTIN },
78         { "return",     RETURN,         RETURN },
79         { "sin",        FSIN,           BLTIN },
80         { "split",      SPLIT,          SPLIT },
81         { "sprintf",    SPRINTF,        SPRINTF },
82         { "sqrt",       FSQRT,          BLTIN },
83         { "srand",      FSRAND,         BLTIN },
84         { "sub",        SUB,            SUB },
85         { "substr",     SUBSTR,         SUBSTR },
86         { "system",     FSYSTEM,        BLTIN },
87         { "tolower",    FTOLOWER,       BLTIN },
88         { "toupper",    FTOUPPER,       BLTIN },
89         { "while",      WHILE,          WHILE },
90 };
91
/*
 * Return token x from the enclosing function, printing "lex <name>" first
 * when dbg is set.  The do { } while (0) wrapper makes "RET(x);" a single
 * statement, so it is safe under an unbraced if/else; x is evaluated once.
 */
#define RET(x)	do { \
		int ret_token_ = (x); \
		if (dbg) \
			printf("lex %s\n", tokname(ret_token_)); \
		return ret_token_; \
	} while (0)
93
/*
 * Look at the next input character without consuming it: the character
 * is read with input() and immediately handed back with unput().
 * NOTE(review): unput() lowers lineno when the character is '\n', so
 * peeking at a newline has that side effect too.
 */
static int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
100
/*
 * gettok - read the next raw token from the input into *pbuf.
 *
 * pbuf/psz describe a growable buffer owned by the caller; both are
 * updated if the buffer had to be enlarged with adjbuf().
 *
 * Returns:
 *   0    at end of input;
 *   c    the character itself when it cannot start a name or a number
 *        (buf then holds just that character);
 *   'a'  when buf holds a name ([A-Za-z_][A-Za-z0-9_]*);
 *   '0'  when buf holds a number accepted by strtod();
 *   the first character (its own type) when the text started like a
 *   number but strtod() accepted nothing, e.g. a lone '.'.
 *
 * Room for the terminating NUL is reserved *before* every store, so the
 * final "*bp = 0" stays in bounds even when end of input arrives with
 * the buffer full (assumes adjbuf() guarantees at least the requested
 * size on success).
 */
static int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/* keep space for this character plus the NUL */
			if (bp - buf + 2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* keep space for this character plus the NUL */
			if (bp - buf + 2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE truncating: rem+1 points
			 * at buf[1], so truncating first would leave nothing
			 * to push back.
			 */
			unputstr(rem+1); /* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
163
/* Token-level helpers defined further down in this file. */
int word(char *w);
int string(void);
int regexpr(void);

/* One-shot requests consumed at the top of the next yylex() call. */
bool sc = false;	/* true => return a } right now */
bool reg = false;	/* true => return a REGEXPR now */
169
170 int yylex(void)
171 {
172         int c;
173         static char *buf = NULL;
174         static int bufsize = 5; /* BUG: setting this small causes core dump! */
175
176         if (buf == NULL && (buf = malloc(bufsize)) == NULL)
177                 FATAL( "out of space in yylex" );
178         if (sc) {
179                 sc = false;
180                 RET('}');
181         }
182         if (reg) {
183                 reg = false;
184                 return regexpr();
185         }
186         for (;;) {
187                 c = gettok(&buf, &bufsize);
188                 if (c == 0)
189                         return 0;
190                 if (isalpha(c) || c == '_')
191                         return word(buf);
192                 if (isdigit(c)) {
193                         char *cp = tostring(buf);
194                         yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab);
195                         free(cp);
196                         /* should this also have STR set? */
197                         RET(NUMBER);
198                 }
199
200                 yylval.i = c;
201                 switch (c) {
202                 case '\n':      /* {EOL} */
203                         lineno++;
204                         RET(NL);
205                 case '\r':      /* assume \n is coming */
206                 case ' ':       /* {WS}+ */
207                 case '\t':
208                         break;
209                 case '#':       /* #.* strip comments */
210                         while ((c = input()) != '\n' && c != 0)
211                                 ;
212                         unput(c);
213                         /*
214                          * Next line is a hack, itcompensates for
215                          * unput's treatment of \n.
216                          */
217                         lineno++;
218                         break;
219                 case ';':
220                         RET(';');
221                 case '\\':
222                         if (peek() == '\n') {
223                                 input();
224                                 lineno++;
225                         } else if (peek() == '\r') {
226                                 input(); input();       /* \n */
227                                 lineno++;
228                         } else {
229                                 RET(c);
230                         }
231                         break;
232                 case '&':
233                         if (peek() == '&') {
234                                 input(); RET(AND);
235                         } else
236                                 RET('&');
237                 case '|':
238                         if (peek() == '|') {
239                                 input(); RET(BOR);
240                         } else
241                                 RET('|');
242                 case '!':
243                         if (peek() == '=') {
244                                 input(); yylval.i = NE; RET(NE);
245                         } else if (peek() == '~') {
246                                 input(); yylval.i = NOTMATCH; RET(MATCHOP);
247                         } else
248                                 RET(NOT);
249                 case '~':
250                         yylval.i = MATCH;
251                         RET(MATCHOP);
252                 case '<':
253                         if (peek() == '=') {
254                                 input(); yylval.i = LE; RET(LE);
255                         } else {
256                                 yylval.i = LT; RET(LT);
257                         }
258                 case '=':
259                         if (peek() == '=') {
260                                 input(); yylval.i = EQ; RET(EQ);
261                         } else {
262                                 yylval.i = ASSIGN; RET(ASGNOP);
263                         }
264                 case '>':
265                         if (peek() == '=') {
266                                 input(); yylval.i = GE; RET(GE);
267                         } else if (peek() == '>') {
268                                 input(); yylval.i = APPEND; RET(APPEND);
269                         } else {
270                                 yylval.i = GT; RET(GT);
271                         }
272                 case '+':
273                         if (peek() == '+') {
274                                 input(); yylval.i = INCR; RET(INCR);
275                         } else if (peek() == '=') {
276                                 input(); yylval.i = ADDEQ; RET(ASGNOP);
277                         } else
278                                 RET('+');
279                 case '-':
280                         if (peek() == '-') {
281                                 input(); yylval.i = DECR; RET(DECR);
282                         } else if (peek() == '=') {
283                                 input(); yylval.i = SUBEQ; RET(ASGNOP);
284                         } else
285                                 RET('-');
286                 case '*':
287                         if (peek() == '=') {    /* *= */
288                                 input(); yylval.i = MULTEQ; RET(ASGNOP);
289                         } else if (peek() == '*') {     /* ** or **= */
290                                 input();        /* eat 2nd * */
291                                 if (peek() == '=') {
292                                         input(); yylval.i = POWEQ; RET(ASGNOP);
293                                 } else {
294                                         RET(POWER);
295                                 }
296                         } else
297                                 RET('*');
298                 case '/':
299                         RET('/');
300                 case '%':
301                         if (peek() == '=') {
302                                 input(); yylval.i = MODEQ; RET(ASGNOP);
303                         } else
304                                 RET('%');
305                 case '^':
306                         if (peek() == '=') {
307                                 input(); yylval.i = POWEQ; RET(ASGNOP);
308                         } else
309                                 RET(POWER);
310
311                 case '$':
312                         /* BUG: awkward, if not wrong */
313                         c = gettok(&buf, &bufsize);
314                         if (isalpha(c)) {
315                                 if (strcmp(buf, "NF") == 0) {   /* very special */
316                                         unputstr("(NF)");
317                                         RET(INDIRECT);
318                                 }
319                                 c = peek();
320                                 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
321                                         unputstr(buf);
322                                         RET(INDIRECT);
323                                 }
324                                 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
325                                 RET(IVAR);
326                         } else if (c == 0) {    /*  */
327                                 SYNTAX( "unexpected end of input after $" );
328                                 RET(';');
329                         } else {
330                                 unputstr(buf);
331                                 RET(INDIRECT);
332                         }
333
334                 case '}':
335                         if (--bracecnt < 0)
336                                 SYNTAX( "extra }" );
337                         sc = true;
338                         RET(';');
339                 case ']':
340                         if (--brackcnt < 0)
341                                 SYNTAX( "extra ]" );
342                         RET(']');
343                 case ')':
344                         if (--parencnt < 0)
345                                 SYNTAX( "extra )" );
346                         RET(')');
347                 case '{':
348                         bracecnt++;
349                         RET('{');
350                 case '[':
351                         brackcnt++;
352                         RET('[');
353                 case '(':
354                         parencnt++;
355                         RET('(');
356
357                 case '"':
358                         return string();        /* BUG: should be like tran.c ? */
359
360                 default:
361                         RET(c);
362                 }
363         }
364 }
365
366 int string(void)
367 {
368         int c, n;
369         char *s, *bp;
370         static char *buf = NULL;
371         static int bufsz = 500;
372
373         if (buf == NULL && (buf = malloc(bufsz)) == NULL)
374                 FATAL("out of space for strings");
375         for (bp = buf; (c = input()) != '"'; ) {
376                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
377                         FATAL("out of space for string %.10s...", buf);
378                 switch (c) {
379                 case '\n':
380                 case '\r':
381                 case 0:
382                         *bp = '\0';
383                         SYNTAX( "non-terminated string %.10s...", buf );
384                         if (c == 0)     /* hopeless */
385                                 FATAL( "giving up" );
386                         lineno++;
387                         break;
388                 case '\\':
389                         c = input();
390                         switch (c) {
391                         case '\n': break;
392                         case '"': *bp++ = '"'; break;
393                         case 'n': *bp++ = '\n'; break;
394                         case 't': *bp++ = '\t'; break;
395                         case 'f': *bp++ = '\f'; break;
396                         case 'r': *bp++ = '\r'; break;
397                         case 'b': *bp++ = '\b'; break;
398                         case 'v': *bp++ = '\v'; break;
399                         case 'a': *bp++ = '\a'; break;
400                         case '\\': *bp++ = '\\'; break;
401
402                         case '0': case '1': case '2': /* octal: \d \dd \ddd */
403                         case '3': case '4': case '5': case '6': case '7':
404                                 n = c - '0';
405                                 if ((c = peek()) >= '0' && c < '8') {
406                                         n = 8 * n + input() - '0';
407                                         if ((c = peek()) >= '0' && c < '8')
408                                                 n = 8 * n + input() - '0';
409                                 }
410                                 *bp++ = n;
411                                 break;
412
413                         case 'x':       /* hex  \x0-9a-fA-F + */
414                             {   char xbuf[100], *px;
415                                 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
416                                         if (isdigit(c)
417                                          || (c >= 'a' && c <= 'f')
418                                          || (c >= 'A' && c <= 'F'))
419                                                 *px++ = c;
420                                         else
421                                                 break;
422                                 }
423                                 *px = 0;
424                                 unput(c);
425                                 sscanf(xbuf, "%x", (unsigned int *) &n);
426                                 *bp++ = n;
427                                 break;
428                             }
429
430                         default:
431                                 *bp++ = c;
432                                 break;
433                         }
434                         break;
435                 default:
436                         *bp++ = c;
437                         break;
438                 }
439         }
440         *bp = 0;
441         s = tostring(buf);
442         *bp++ = ' '; *bp++ = '\0';
443         yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
444         free(s);
445         RET(STRING);
446 }
447
448
449 static int binsearch(char *w, const Keyword *kp, int n)
450 {
451         int cond, low, mid, high;
452
453         low = 0;
454         high = n - 1;
455         while (low <= high) {
456                 mid = (low + high) / 2;
457                 if ((cond = strcmp(w, kp[mid].word)) < 0)
458                         high = mid - 1;
459                 else if (cond > 0)
460                         low = mid + 1;
461                 else
462                         return mid;
463         }
464         return -1;
465 }
466
467 int word(char *w)
468 {
469         const Keyword *kp;
470         int c, n;
471
472         n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
473         if (n != -1) {  /* found in table */
474                 kp = keywords + n;
475                 yylval.i = kp->sub;
476                 switch (kp->type) {     /* special handling */
477                 case BLTIN:
478                         if (kp->sub == FSYSTEM && safe)
479                                 SYNTAX( "system is unsafe" );
480                         RET(kp->type);
481                 case FUNC:
482                         if (infunc)
483                                 SYNTAX( "illegal nested function" );
484                         RET(kp->type);
485                 case RETURN:
486                         if (!infunc)
487                                 SYNTAX( "return not in function" );
488                         RET(kp->type);
489                 case VARNF:
490                         yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
491                         RET(VARNF);
492                 default:
493                         RET(kp->type);
494                 }
495         }
496         c = peek();     /* look for '(' */
497         if (c != '(' && infunc && (n=isarg(w)) >= 0) {
498                 yylval.i = n;
499                 RET(ARG);
500         } else {
501                 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
502                 if (c == '(') {
503                         RET(CALL);
504                 } else {
505                         RET(VAR);
506                 }
507         }
508 }
509
510 void startreg(void)     /* next call to yylex will return a regular expression */
511 {
512         reg = true;
513 }
514
515 int regexpr(void)
516 {
517         int c;
518         static char *buf = NULL;
519         static int bufsz = 500;
520         char *bp;
521
522         if (buf == NULL && (buf = malloc(bufsz)) == NULL)
523                 FATAL("out of space for rex expr");
524         bp = buf;
525         for ( ; (c = input()) != '/' && c != 0; ) {
526                 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
527                         FATAL("out of space for reg expr %.10s...", buf);
528                 if (c == '\n') {
529                         *bp = '\0';
530                         SYNTAX( "newline in regular expression %.10s...", buf );
531                         unput('\n');
532                         break;
533                 } else if (c == '\\') {
534                         *bp++ = '\\';
535                         *bp++ = input();
536                 } else {
537                         *bp++ = c;
538                 }
539         }
540         *bp = 0;
541         if (c == 0)
542                 SYNTAX("non-terminated regular expression %.10s...", buf);
543         yylval.s = tostring(buf);
544         unput('/');
545         RET(REGEXPR);
546 }
547
548 /* low-level lexical stuff, sort of inherited from lex */
549
/* Ring of the most recently read characters; input() stores, unput() backs up. */
char ebuf[300];
char *ep = ebuf;		/* next slot to write in ebuf */

/* Pushback stack: unput() pushes, input() pops before reading anything new. */
char yysbuf[100];
char *yysptr = yysbuf;		/* one past the top of the stack */

FILE *yyin = NULL;		/* not referenced in this file; presumably used elsewhere */
555
556 int input(void) /* get next lexical input character */
557 {
558         int c;
559         extern char *lexprog;
560
561         if (yysptr > yysbuf)
562                 c = (uschar)*--yysptr;
563         else if (lexprog != NULL) {     /* awk '...' */
564                 if ((c = (uschar)*lexprog) != 0)
565                         lexprog++;
566         } else                          /* awk -f ... */
567                 c = pgetc();
568         if (c == EOF)
569                 c = 0;
570         if (ep >= ebuf + sizeof ebuf)
571                 ep = ebuf;
572         *ep = c;
573         if (c != 0) {
574                 ep++;
575         }
576         return (c);
577 }
578
579 void unput(int c)       /* put lexical character back on input */
580 {
581         if (c == '\n')  
582                 lineno--;
583         if (yysptr >= yysbuf + sizeof(yysbuf))
584                 FATAL("pushed back too much: %.20s...", yysbuf);
585         *yysptr++ = c;
586         if (--ep < ebuf)
587                 ep = ebuf + sizeof(ebuf) - 1;
588 }
589
/*
 * unputstr - push the whole string s back onto the input.
 *
 * Characters are pushed last-to-first so that input() returns them in
 * their original order.  An empty string pushes nothing.  The index is
 * a size_t, so the length is never narrowed to int.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	size_t left;

	for (left = strlen(s); left > 0; left--)
		unput(s[left - 1]);
}