Merge from vendor branch BINUTILS:
[dragonfly.git] / contrib / awk20040207 / lex.c
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31
extern YYSTYPE  yylval;         /* semantic value of the token being returned; filled in by yylex and its helpers */
extern int      infunc;         /* defined elsewhere; word() treats nonzero as "inside a function definition" */

int     lineno  = 1;            /* current input line: input() adds 1 per '\n' read, unput() subtracts 1 per '\n' pushed back */
int     bracecnt = 0;           /* '{' seen so far minus '}' seen so far */
int     brackcnt  = 0;          /* '[' seen so far minus ']' seen so far */
int     parencnt = 0;           /* '(' seen so far minus ')' seen so far */
39
/* One row of the reserved-word table that word() searches. */
typedef struct Keyword Keyword;
struct Keyword {
        const char *word;       /* spelling of the reserved word */
        int sub;                /* stored in yylval.i when the word is matched */
        int type;               /* token code handed back to the parser */
};
45
/*
 * Reserved words and built-in function names, looked up by word() via
 * binsearch().  Columns: spelling, value placed in yylval.i, token type.
 * Built-in functions all share the token type BLTIN and are told apart
 * by the second column.  Rows must stay in strcmp() order.
 */
Keyword keywords[] ={   /* keep sorted: binary searched */
        { "BEGIN",      XBEGIN,         XBEGIN },
        { "END",        XEND,           XEND },
        { "NF",         VARNF,          VARNF },
        { "atan2",      FATAN,          BLTIN },
        { "break",      BREAK,          BREAK },
        { "close",      CLOSE,          CLOSE },
        { "continue",   CONTINUE,       CONTINUE },
        { "cos",        FCOS,           BLTIN },
        { "delete",     DELETE,         DELETE },
        { "do",         DO,             DO },
        { "else",       ELSE,           ELSE },
        { "exit",       EXIT,           EXIT },
        { "exp",        FEXP,           BLTIN },
        { "fflush",     FFLUSH,         BLTIN },
        { "for",        FOR,            FOR },
        { "func",       FUNC,           FUNC },
        { "function",   FUNC,           FUNC },
        { "getline",    GETLINE,        GETLINE },
        { "gsub",       GSUB,           GSUB },
        { "if",         IF,             IF },
        { "in",         IN,             IN },
        { "index",      INDEX,          INDEX },
        { "int",        FINT,           BLTIN },
        { "length",     FLENGTH,        BLTIN },
        { "log",        FLOG,           BLTIN },
        { "match",      MATCHFCN,       MATCHFCN },
        { "next",       NEXT,           NEXT },
        { "nextfile",   NEXTFILE,       NEXTFILE },
        { "print",      PRINT,          PRINT },
        { "printf",     PRINTF,         PRINTF },
        { "rand",       FRAND,          BLTIN },
        { "return",     RETURN,         RETURN },
        { "sin",        FSIN,           BLTIN },
        { "split",      SPLIT,          SPLIT },
        { "sprintf",    SPRINTF,        SPRINTF },
        { "sqrt",       FSQRT,          BLTIN },
        { "srand",      FSRAND,         BLTIN },
        { "sub",        SUB,            SUB },
        { "substr",     SUBSTR,         SUBSTR },
        { "system",     FSYSTEM,        BLTIN },
        { "tolower",    FTOLOWER,       BLTIN },
        { "toupper",    FTOUPPER,       BLTIN },
        { "while",      WHILE,          WHILE },
};
91
/*
 * RET(x): return token x from the enclosing function.  When DEBUG is
 * defined and dbg is nonzero, the token's name is printed first.
 *
 * The debug form is wrapped in do { } while (0) so that "RET(x);" is a
 * single statement and stays correct in an unbraced if/else arm, and x
 * is evaluated exactly once.
 */
#define DEBUG
#ifdef  DEBUG
#define RET(x)  do { int ret_tok_ = (x); if (dbg) printf("lex %s\n", tokname(ret_tok_)); return ret_tok_; } while (0)
#else
#define RET(x)  return(x)
#endif
98
/*
 * peek: report the next input character without consuming it.
 * Implemented as one input() followed by an unput() of the same value.
 */
int peek(void)
{
        int next;

        next = input();
        unput(next);
        return next;
}
105
/*
 * gettok: read the next raw token from the input into the caller's buffer.
 *
 * pbuf, psz: addresses of the caller's buffer pointer and its size.  Both
 *      are written back on the name/number paths, because adjbuf() may be
 *      asked to enlarge the buffer while a long token is collected.
 *
 * Returns:
 *      0    at end of input;
 *      'a'  for a name (letters, digits and '_', not starting with a
 *           digit), with the name left in the buffer;
 *      '0'  for a number, with the longest prefix accepted by strtod()
 *           left in the buffer and the rest pushed back on the input;
 *      otherwise the character itself, which is also stored as a
 *           one-character string in the buffer.
 */
int gettok(char **pbuf, int *psz)       /* get next input token */
{
        int c, retc;
        char *buf = *pbuf;
        int sz = *psz;
        char *bp = buf;

        c = input();
        if (c == 0)
                return 0;
        buf[0] = c;
        buf[1] = 0;
        if (!isalnum(c) && c != '.' && c != '_')
                return c;

        *bp++ = c;
        if (isalpha(c) || c == '_') {   /* it's a varname */
                for ( ; (c = input()) != 0; ) {
                        /*
                         * Keep one byte spare for the terminator: the loop
                         * can end at end of input without another check.
                         * (Assumes adjbuf() provides at least the size asked for.)
                         */
                        if (bp-buf >= sz-1)
                                if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
                                        FATAL( "out of space for name %.10s...", buf );
                        if (isalnum(c) || c == '_')
                                *bp++ = c;
                        else {
                                *bp = 0;
                                unput(c);
                                break;
                        }
                }
                *bp = 0;
                retc = 'a';     /* alphanumeric */
        } else {        /* it's a number */
                char *rem;
                /* read input until can't be a number */
                for ( ; (c = input()) != 0; ) {
                        /* same one-byte reserve as in the name loop above */
                        if (bp-buf >= sz-1)
                                if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
                                        FATAL( "out of space for number %.10s...", buf );
                        if (isdigit(c) || c == 'e' || c == 'E' 
                          || c == '.' || c == '+' || c == '-')
                                *bp++ = c;
                        else {
                                unput(c);
                                break;
                        }
                }
                *bp = 0;
                strtod(buf, &rem);      /* parse the number */
                if (rem == buf) {       /* it wasn't a valid number at all */
                        /*
                         * The first character is returned as the token, so
                         * only what follows it goes back on the input;
                         * pushing it back too would deliver it again on
                         * every later call.
                         */
                        unputstr(rem+1);
                        buf[1] = 0;     /* so return one character as token */
                        retc = buf[0];  /* character is its own type */
                } else {        /* some prefix was a number */
                        unputstr(rem);  /* put rest back for later */
                        rem[0] = 0;     /* so truncate where failure started */
                        retc = '0';     /* number */
                }
        }
        *pbuf = buf;
        *psz = sz;
        return retc;
}
167
int     word(char *);           /* classify a name: keyword, ARG, CALL or VAR */
int     string(void);           /* scan the rest of a "..." literal */
int     regexpr(void);          /* scan the rest of a /.../ regular expression */
int     sc      = 0;    /* 1 => return a } right now */
int     reg     = 0;    /* 1 => return a REGEXPR now */
173
174 int yylex(void)
175 {
176         int c;
177         static char *buf = 0;
178         static int bufsize = 500;
179
180         if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
181                 FATAL( "out of space in yylex" );
182         if (sc) {
183                 sc = 0;
184                 RET('}');
185         }
186         if (reg) {
187                 reg = 0;
188                 return regexpr();
189         }
190         for (;;) {
191                 c = gettok(&buf, &bufsize);
192                 if (c == 0)
193                         return 0;
194                 if (isalpha(c) || c == '_')
195                         return word(buf);
196                 if (isdigit(c)) {
197                         yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
198                         /* should this also have STR set? */
199                         RET(NUMBER);
200                 }
201         
202                 yylval.i = c;
203                 switch (c) {
204                 case '\n':      /* {EOL} */
205                         RET(NL);
206                 case '\r':      /* assume \n is coming */
207                 case ' ':       /* {WS}+ */
208                 case '\t':
209                         break;
210                 case '#':       /* #.* strip comments */
211                         while ((c = input()) != '\n' && c != 0)
212                                 ;
213                         unput(c);
214                         break;
215                 case ';':
216                         RET(';');
217                 case '\\':
218                         if (peek() == '\n') {
219                                 input();
220                         } else if (peek() == '\r') {
221                                 input(); input();       /* \n */
222                                 lineno++;
223                         } else {
224                                 RET(c);
225                         }
226                         break;
227                 case '&':
228                         if (peek() == '&') {
229                                 input(); RET(AND);
230                         } else 
231                                 RET('&');
232                 case '|':
233                         if (peek() == '|') {
234                                 input(); RET(BOR);
235                         } else
236                                 RET('|');
237                 case '!':
238                         if (peek() == '=') {
239                                 input(); yylval.i = NE; RET(NE);
240                         } else if (peek() == '~') {
241                                 input(); yylval.i = NOTMATCH; RET(MATCHOP);
242                         } else
243                                 RET(NOT);
244                 case '~':
245                         yylval.i = MATCH;
246                         RET(MATCHOP);
247                 case '<':
248                         if (peek() == '=') {
249                                 input(); yylval.i = LE; RET(LE);
250                         } else {
251                                 yylval.i = LT; RET(LT);
252                         }
253                 case '=':
254                         if (peek() == '=') {
255                                 input(); yylval.i = EQ; RET(EQ);
256                         } else {
257                                 yylval.i = ASSIGN; RET(ASGNOP);
258                         }
259                 case '>':
260                         if (peek() == '=') {
261                                 input(); yylval.i = GE; RET(GE);
262                         } else if (peek() == '>') {
263                                 input(); yylval.i = APPEND; RET(APPEND);
264                         } else {
265                                 yylval.i = GT; RET(GT);
266                         }
267                 case '+':
268                         if (peek() == '+') {
269                                 input(); yylval.i = INCR; RET(INCR);
270                         } else if (peek() == '=') {
271                                 input(); yylval.i = ADDEQ; RET(ASGNOP);
272                         } else
273                                 RET('+');
274                 case '-':
275                         if (peek() == '-') {
276                                 input(); yylval.i = DECR; RET(DECR);
277                         } else if (peek() == '=') {
278                                 input(); yylval.i = SUBEQ; RET(ASGNOP);
279                         } else
280                                 RET('-');
281                 case '*':
282                         if (peek() == '=') {    /* *= */
283                                 input(); yylval.i = MULTEQ; RET(ASGNOP);
284                         } else if (peek() == '*') {     /* ** or **= */
285                                 input();        /* eat 2nd * */
286                                 if (peek() == '=') {
287                                         input(); yylval.i = POWEQ; RET(ASGNOP);
288                                 } else {
289                                         RET(POWER);
290                                 }
291                         } else
292                                 RET('*');
293                 case '/':
294                         RET('/');
295                 case '%':
296                         if (peek() == '=') {
297                                 input(); yylval.i = MODEQ; RET(ASGNOP);
298                         } else
299                                 RET('%');
300                 case '^':
301                         if (peek() == '=') {
302                                 input(); yylval.i = POWEQ; RET(ASGNOP);
303                         } else
304                                 RET(POWER);
305
306                 case '$':
307                         /* BUG: awkward, if not wrong */
308                         c = gettok(&buf, &bufsize);
309                         if (isalpha(c)) {
310                                 if (strcmp(buf, "NF") == 0) {   /* very special */
311                                         unputstr("(NF)");
312                                         RET(INDIRECT);
313                                 }
314                                 c = peek();
315                                 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
316                                         unputstr(buf);
317                                         RET(INDIRECT);
318                                 }
319                                 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
320                                 RET(IVAR);
321                         } else if (c == 0) {    /*  */
322                                 SYNTAX( "unexpected end of input after $" );
323                                 RET(';');
324                         } else {
325                                 unputstr(buf);
326                                 RET(INDIRECT);
327                         }
328         
329                 case '}':
330                         if (--bracecnt < 0)
331                                 SYNTAX( "extra }" );
332                         sc = 1;
333                         RET(';');
334                 case ']':
335                         if (--brackcnt < 0)
336                                 SYNTAX( "extra ]" );
337                         RET(']');
338                 case ')':
339                         if (--parencnt < 0)
340                                 SYNTAX( "extra )" );
341                         RET(')');
342                 case '{':
343                         bracecnt++;
344                         RET('{');
345                 case '[':
346                         brackcnt++;
347                         RET('[');
348                 case '(':
349                         parencnt++;
350                         RET('(');
351         
352                 case '"':
353                         return string();        /* BUG: should be like tran.c ? */
354         
355                 default:
356                         RET(c);
357                 }
358         }
359 }
360
361 int string(void)
362 {
363         int c, n;
364         char *s, *bp;
365         static char *buf = 0;
366         static int bufsz = 500;
367
368         if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
369                 FATAL("out of space for strings");
370         for (bp = buf; (c = input()) != '"'; ) {
371                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
372                         FATAL("out of space for string %.10s...", buf);
373                 switch (c) {
374                 case '\n':
375                 case '\r':
376                 case 0:
377                         SYNTAX( "non-terminated string %.10s...", buf );
378                         lineno++;
379                         if (c == 0)     /* hopeless */
380                                 FATAL( "giving up" );
381                         break;
382                 case '\\':
383                         c = input();
384                         switch (c) {
385                         case '"': *bp++ = '"'; break;
386                         case 'n': *bp++ = '\n'; break;  
387                         case 't': *bp++ = '\t'; break;
388                         case 'f': *bp++ = '\f'; break;
389                         case 'r': *bp++ = '\r'; break;
390                         case 'b': *bp++ = '\b'; break;
391                         case 'v': *bp++ = '\v'; break;
392                         case 'a': *bp++ = '\007'; break;
393                         case '\\': *bp++ = '\\'; break;
394
395                         case '0': case '1': case '2': /* octal: \d \dd \ddd */
396                         case '3': case '4': case '5': case '6': case '7':
397                                 n = c - '0';
398                                 if ((c = peek()) >= '0' && c < '8') {
399                                         n = 8 * n + input() - '0';
400                                         if ((c = peek()) >= '0' && c < '8')
401                                                 n = 8 * n + input() - '0';
402                                 }
403                                 *bp++ = n;
404                                 break;
405
406                         case 'x':       /* hex  \x0-9a-fA-F + */
407                             {   char xbuf[100], *px;
408                                 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
409                                         if (isdigit(c)
410                                          || (c >= 'a' && c <= 'f')
411                                          || (c >= 'A' && c <= 'F'))
412                                                 *px++ = c;
413                                         else
414                                                 break;
415                                 }
416                                 *px = 0;
417                                 unput(c);
418                                 sscanf(xbuf, "%x", &n);
419                                 *bp++ = n;
420                                 break;
421                             }
422
423                         default: 
424                                 *bp++ = c;
425                                 break;
426                         }
427                         break;
428                 default:
429                         *bp++ = c;
430                         break;
431                 }
432         }
433         *bp = 0; 
434         s = tostring(buf);
435         *bp++ = ' '; *bp++ = 0;
436         yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
437         RET(STRING);
438 }
439
440
441 int binsearch(char *w, Keyword *kp, int n)
442 {
443         int cond, low, mid, high;
444
445         low = 0;
446         high = n - 1;
447         while (low <= high) {
448                 mid = (low + high) / 2;
449                 if ((cond = strcmp(w, kp[mid].word)) < 0)
450                         high = mid - 1;
451                 else if (cond > 0)
452                         low = mid + 1;
453                 else
454                         return mid;
455         }
456         return -1;
457 }
458
459 int word(char *w) 
460 {
461         Keyword *kp;
462         int c, n;
463
464         n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
465         kp = keywords + n;
466         if (n != -1) {  /* found in table */
467                 yylval.i = kp->sub;
468                 switch (kp->type) {     /* special handling */
469                 case FSYSTEM:
470                         if (safe)
471                                 SYNTAX( "system is unsafe" );
472                         RET(kp->type);
473                 case FUNC:
474                         if (infunc)
475                                 SYNTAX( "illegal nested function" );
476                         RET(kp->type);
477                 case RETURN:
478                         if (!infunc)
479                                 SYNTAX( "return not in function" );
480                         RET(kp->type);
481                 case VARNF:
482                         yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
483                         RET(VARNF);
484                 default:
485                         RET(kp->type);
486                 }
487         }
488         c = peek();     /* look for '(' */
489         if (c != '(' && infunc && (n=isarg(w)) >= 0) {
490                 yylval.i = n;
491                 RET(ARG);
492         } else {
493                 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
494                 if (c == '(') {
495                         RET(CALL);
496                 } else {
497                         RET(VAR);
498                 }
499         }
500 }
501
502 void startreg(void)     /* next call to yylex will return a regular expression */
503 {
504         reg = 1;
505 }
506
507 int regexpr(void)
508 {
509         int c;
510         static char *buf = 0;
511         static int bufsz = 500;
512         char *bp;
513
514         if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
515                 FATAL("out of space for rex expr");
516         bp = buf;
517         for ( ; (c = input()) != '/' && c != 0; ) {
518                 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
519                         FATAL("out of space for reg expr %.10s...", buf);
520                 if (c == '\n') {
521                         SYNTAX( "newline in regular expression %.10s...", buf ); 
522                         unput('\n');
523                         break;
524                 } else if (c == '\\') {
525                         *bp++ = '\\'; 
526                         *bp++ = input();
527                 } else {
528                         *bp++ = c;
529                 }
530         }
531         *bp = 0;
532         if (c == 0)
533                 SYNTAX("non-terminated regular expression %.10s...", buf);
534         yylval.s = tostring(buf);
535         unput('/');
536         RET(REGEXPR);
537 }
538
539 /* low-level lexical stuff, sort of inherited from lex */
540
char    ebuf[300];      /* circular record of characters returned by input(); presumably read elsewhere for error context */
char    *ep = ebuf;     /* next slot in ebuf: advanced by input(), backed up by unput() */
char    yysbuf[100];    /* pushback buffer */
char    *yysptr = yysbuf;       /* one past the most recently pushed-back character */
FILE    *yyin = 0;      /* not used in this part of the file */
546
547 int input(void) /* get next lexical input character */
548 {
549         int c;
550         extern char *lexprog;
551
552         if (yysptr > yysbuf)
553                 c = (uschar)*--yysptr;
554         else if (lexprog != NULL) {     /* awk '...' */
555                 if ((c = (uschar)*lexprog) != 0)
556                         lexprog++;
557         } else                          /* awk -f ... */
558                 c = pgetc();
559         if (c == '\n')
560                 lineno++;
561         else if (c == EOF)
562                 c = 0;
563         if (ep >= ebuf + sizeof ebuf)
564                 ep = ebuf;
565         return *ep++ = c;
566 }
567
568 void unput(int c)       /* put lexical character back on input */
569 {
570         if (c == '\n')
571                 lineno--;
572         if (yysptr >= yysbuf + sizeof(yysbuf))
573                 FATAL("pushed back too much: %.20s...", yysbuf);
574         *yysptr++ = c;
575         if (--ep < ebuf)
576                 ep = ebuf + sizeof(ebuf) - 1;
577 }
578
/*
 * unputstr: push the string s back on the input.  Characters are pushed
 * last-first, so they will be read again in their original order.
 */
void unputstr(const char *s)    /* put a string back on input */
{
        const char *p = s + strlen(s);

        while (p > s)
                unput(*--p);
}