Import of awk version 20100523
[dragonfly.git] / contrib / awk20050424 / lex.c
... / ...
CommitLineData
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved

Permission to use, copy, modify, and distribute this software and
its documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appear in all
copies and that both that the copyright notice and this
permission notice and warranty disclaimer appear in supporting
documentation, and that the name Lucent Technologies or any of
its entities not be used in advertising or publicity pertaining
to distribution of the software without specific, written prior
permission.

LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <ctype.h>
29#include "awk.h"
30#include "ytab.h"
31
32extern YYSTYPE yylval;
33extern int infunc;
34
35int lineno = 1;
36int bracecnt = 0;
37int brackcnt = 0;
38int parencnt = 0;
39
/* One reserved word or built-in name recognised by word(). */
typedef struct Keyword {
	const char	*word;	/* spelling; key for binsearch() */
	int		sub;	/* value word() stores in yylval.i */
	int		type;	/* token type word() returns to the parser */
} Keyword;
45
46Keyword keywords[] ={ /* keep sorted: binary searched */
47 { "BEGIN", XBEGIN, XBEGIN },
48 { "END", XEND, XEND },
49 { "NF", VARNF, VARNF },
50 { "atan2", FATAN, BLTIN },
51 { "break", BREAK, BREAK },
52 { "close", CLOSE, CLOSE },
53 { "continue", CONTINUE, CONTINUE },
54 { "cos", FCOS, BLTIN },
55 { "delete", DELETE, DELETE },
56 { "do", DO, DO },
57 { "else", ELSE, ELSE },
58 { "exit", EXIT, EXIT },
59 { "exp", FEXP, BLTIN },
60 { "fflush", FFLUSH, BLTIN },
61 { "for", FOR, FOR },
62 { "func", FUNC, FUNC },
63 { "function", FUNC, FUNC },
64 { "getline", GETLINE, GETLINE },
65 { "gsub", GSUB, GSUB },
66 { "if", IF, IF },
67 { "in", IN, IN },
68 { "index", INDEX, INDEX },
69 { "int", FINT, BLTIN },
70 { "length", FLENGTH, BLTIN },
71 { "log", FLOG, BLTIN },
72 { "match", MATCHFCN, MATCHFCN },
73 { "next", NEXT, NEXT },
74 { "nextfile", NEXTFILE, NEXTFILE },
75 { "print", PRINT, PRINT },
76 { "printf", PRINTF, PRINTF },
77 { "rand", FRAND, BLTIN },
78 { "return", RETURN, RETURN },
79 { "sin", FSIN, BLTIN },
80 { "split", SPLIT, SPLIT },
81 { "sprintf", SPRINTF, SPRINTF },
82 { "sqrt", FSQRT, BLTIN },
83 { "srand", FSRAND, BLTIN },
84 { "sub", SUB, SUB },
85 { "substr", SUBSTR, SUBSTR },
86 { "system", FSYSTEM, BLTIN },
87 { "tolower", FTOLOWER, BLTIN },
88 { "toupper", FTOUPPER, BLTIN },
89 { "while", WHILE, WHILE },
90};
91
/*
 * RET(x): return token x from the lexer.
 * With DEBUG defined (it always is here) and dbg non-zero, the token's
 * name is printed via tokname() first.
 * The expansion is wrapped in do { } while (0) so that "RET(x);" is one
 * statement and is safe as the unbraced body of an if/else.  In the DEBUG
 * form x is expanded twice, so it must not have side effects.
 */
#define DEBUG
#ifdef DEBUG
#define RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
#else
#define RET(x)	return (x)
#endif
98
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and immediately pushed back
 * with unput(); 0 means end of input.
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
105
/*
 * gettok: read the next raw token from the input into the caller's buffer.
 *
 * pbuf, psz	address of the growable buffer and of its size; both are
 *		written back when a name or number was collected, because
 *		adjbuf() may have reallocated the buffer.
 *
 * Returns
 *	0	at end of input;
 *	'a'	when the buffer holds a name ([A-Za-z_][A-Za-z0-9_]*);
 *	'0'	when the buffer holds a number accepted by strtod();
 *	else	the single character read, which is also left in the buffer.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep one byte spare: a character may be stored below
			 * and the NUL after the loop must still fit even when
			 * the input ends right after that character.
			 */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same spare-byte rule as for names */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
			unputstr(rem+1); /* put rest back for later */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
168
/* Helpers of yylex() that are defined further down in this file. */
int	word(char *);
int	string(void);
int	regexpr(void);

/* One-shot flags examined at the top of yylex(). */
int	sc  = 0;	/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
174
175int yylex(void)
176{
177 int c;
178 static char *buf = 0;
179 static int bufsize = 500;
180
181 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
182 FATAL( "out of space in yylex" );
183 if (sc) {
184 sc = 0;
185 RET('}');
186 }
187 if (reg) {
188 reg = 0;
189 return regexpr();
190 }
191/* printf("top\n"); */
192 for (;;) {
193 c = gettok(&buf, &bufsize);
194/* printf("gettok [%s]\n", buf); */
195 if (c == 0)
196 return 0;
197 if (isalpha(c) || c == '_')
198 return word(buf);
199 if (isdigit(c)) {
200 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
201 /* should this also have STR set? */
202 RET(NUMBER);
203 }
204
205 yylval.i = c;
206 switch (c) {
207 case '\n': /* {EOL} */
208 RET(NL);
209 case '\r': /* assume \n is coming */
210 case ' ': /* {WS}+ */
211 case '\t':
212 break;
213 case '#': /* #.* strip comments */
214 while ((c = input()) != '\n' && c != 0)
215 ;
216 unput(c);
217 break;
218 case ';':
219 RET(';');
220 case '\\':
221 if (peek() == '\n') {
222 input();
223 } else if (peek() == '\r') {
224 input(); input(); /* \n */
225 lineno++;
226 } else {
227 RET(c);
228 }
229 break;
230 case '&':
231 if (peek() == '&') {
232 input(); RET(AND);
233 } else
234 RET('&');
235 case '|':
236 if (peek() == '|') {
237 input(); RET(BOR);
238 } else
239 RET('|');
240 case '!':
241 if (peek() == '=') {
242 input(); yylval.i = NE; RET(NE);
243 } else if (peek() == '~') {
244 input(); yylval.i = NOTMATCH; RET(MATCHOP);
245 } else
246 RET(NOT);
247 case '~':
248 yylval.i = MATCH;
249 RET(MATCHOP);
250 case '<':
251 if (peek() == '=') {
252 input(); yylval.i = LE; RET(LE);
253 } else {
254 yylval.i = LT; RET(LT);
255 }
256 case '=':
257 if (peek() == '=') {
258 input(); yylval.i = EQ; RET(EQ);
259 } else {
260 yylval.i = ASSIGN; RET(ASGNOP);
261 }
262 case '>':
263 if (peek() == '=') {
264 input(); yylval.i = GE; RET(GE);
265 } else if (peek() == '>') {
266 input(); yylval.i = APPEND; RET(APPEND);
267 } else {
268 yylval.i = GT; RET(GT);
269 }
270 case '+':
271 if (peek() == '+') {
272 input(); yylval.i = INCR; RET(INCR);
273 } else if (peek() == '=') {
274 input(); yylval.i = ADDEQ; RET(ASGNOP);
275 } else
276 RET('+');
277 case '-':
278 if (peek() == '-') {
279 input(); yylval.i = DECR; RET(DECR);
280 } else if (peek() == '=') {
281 input(); yylval.i = SUBEQ; RET(ASGNOP);
282 } else
283 RET('-');
284 case '*':
285 if (peek() == '=') { /* *= */
286 input(); yylval.i = MULTEQ; RET(ASGNOP);
287 } else if (peek() == '*') { /* ** or **= */
288 input(); /* eat 2nd * */
289 if (peek() == '=') {
290 input(); yylval.i = POWEQ; RET(ASGNOP);
291 } else {
292 RET(POWER);
293 }
294 } else
295 RET('*');
296 case '/':
297 RET('/');
298 case '%':
299 if (peek() == '=') {
300 input(); yylval.i = MODEQ; RET(ASGNOP);
301 } else
302 RET('%');
303 case '^':
304 if (peek() == '=') {
305 input(); yylval.i = POWEQ; RET(ASGNOP);
306 } else
307 RET(POWER);
308
309 case '$':
310 /* BUG: awkward, if not wrong */
311 c = gettok(&buf, &bufsize);
312 if (isalpha(c)) {
313 if (strcmp(buf, "NF") == 0) { /* very special */
314 unputstr("(NF)");
315 RET(INDIRECT);
316 }
317 c = peek();
318 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
319 unputstr(buf);
320 RET(INDIRECT);
321 }
322 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
323 RET(IVAR);
324 } else if (c == 0) { /* */
325 SYNTAX( "unexpected end of input after $" );
326 RET(';');
327 } else {
328 unputstr(buf);
329 RET(INDIRECT);
330 }
331
332 case '}':
333 if (--bracecnt < 0)
334 SYNTAX( "extra }" );
335 sc = 1;
336 RET(';');
337 case ']':
338 if (--brackcnt < 0)
339 SYNTAX( "extra ]" );
340 RET(']');
341 case ')':
342 if (--parencnt < 0)
343 SYNTAX( "extra )" );
344 RET(')');
345 case '{':
346 bracecnt++;
347 RET('{');
348 case '[':
349 brackcnt++;
350 RET('[');
351 case '(':
352 parencnt++;
353 RET('(');
354
355 case '"':
356 return string(); /* BUG: should be like tran.c ? */
357
358 default:
359 RET(c);
360 }
361 }
362}
363
364int string(void)
365{
366 int c, n;
367 char *s, *bp;
368 static char *buf = 0;
369 static int bufsz = 500;
370
371 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
372 FATAL("out of space for strings");
373 for (bp = buf; (c = input()) != '"'; ) {
374 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
375 FATAL("out of space for string %.10s...", buf);
376 switch (c) {
377 case '\n':
378 case '\r':
379 case 0:
380 SYNTAX( "non-terminated string %.10s...", buf );
381 lineno++;
382 if (c == 0) /* hopeless */
383 FATAL( "giving up" );
384 break;
385 case '\\':
386 c = input();
387 switch (c) {
388 case '"': *bp++ = '"'; break;
389 case 'n': *bp++ = '\n'; break;
390 case 't': *bp++ = '\t'; break;
391 case 'f': *bp++ = '\f'; break;
392 case 'r': *bp++ = '\r'; break;
393 case 'b': *bp++ = '\b'; break;
394 case 'v': *bp++ = '\v'; break;
395 case 'a': *bp++ = '\007'; break;
396 case '\\': *bp++ = '\\'; break;
397
398 case '0': case '1': case '2': /* octal: \d \dd \ddd */
399 case '3': case '4': case '5': case '6': case '7':
400 n = c - '0';
401 if ((c = peek()) >= '0' && c < '8') {
402 n = 8 * n + input() - '0';
403 if ((c = peek()) >= '0' && c < '8')
404 n = 8 * n + input() - '0';
405 }
406 *bp++ = n;
407 break;
408
409 case 'x': /* hex \x0-9a-fA-F + */
410 { char xbuf[100], *px;
411 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
412 if (isdigit(c)
413 || (c >= 'a' && c <= 'f')
414 || (c >= 'A' && c <= 'F'))
415 *px++ = c;
416 else
417 break;
418 }
419 *px = 0;
420 unput(c);
421 sscanf(xbuf, "%x", &n);
422 *bp++ = n;
423 break;
424 }
425
426 default:
427 *bp++ = c;
428 break;
429 }
430 break;
431 default:
432 *bp++ = c;
433 break;
434 }
435 }
436 *bp = 0;
437 s = tostring(buf);
438 *bp++ = ' '; *bp++ = 0;
439 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
440 RET(STRING);
441}
442
443
444int binsearch(char *w, Keyword *kp, int n)
445{
446 int cond, low, mid, high;
447
448 low = 0;
449 high = n - 1;
450 while (low <= high) {
451 mid = (low + high) / 2;
452 if ((cond = strcmp(w, kp[mid].word)) < 0)
453 high = mid - 1;
454 else if (cond > 0)
455 low = mid + 1;
456 else
457 return mid;
458 }
459 return -1;
460}
461
462int word(char *w)
463{
464 Keyword *kp;
465 int c, n;
466
467 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
468 kp = keywords + n;
469 if (n != -1) { /* found in table */
470 yylval.i = kp->sub;
471 switch (kp->type) { /* special handling */
472 case FSYSTEM:
473 if (safe)
474 SYNTAX( "system is unsafe" );
475 RET(kp->type);
476 case FUNC:
477 if (infunc)
478 SYNTAX( "illegal nested function" );
479 RET(kp->type);
480 case RETURN:
481 if (!infunc)
482 SYNTAX( "return not in function" );
483 RET(kp->type);
484 case VARNF:
485 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
486 RET(VARNF);
487 default:
488 RET(kp->type);
489 }
490 }
491 c = peek(); /* look for '(' */
492 if (c != '(' && infunc && (n=isarg(w)) >= 0) {
493 yylval.i = n;
494 RET(ARG);
495 } else {
496 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
497 if (c == '(') {
498 RET(CALL);
499 } else {
500 RET(VAR);
501 }
502 }
503}
504
505void startreg(void) /* next call to yylex will return a regular expression */
506{
507 reg = 1;
508}
509
510int regexpr(void)
511{
512 int c;
513 static char *buf = 0;
514 static int bufsz = 500;
515 char *bp;
516
517 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
518 FATAL("out of space for rex expr");
519 bp = buf;
520 for ( ; (c = input()) != '/' && c != 0; ) {
521 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
522 FATAL("out of space for reg expr %.10s...", buf);
523 if (c == '\n') {
524 SYNTAX( "newline in regular expression %.10s...", buf );
525 unput('\n');
526 break;
527 } else if (c == '\\') {
528 *bp++ = '\\';
529 *bp++ = input();
530 } else {
531 *bp++ = c;
532 }
533 }
534 *bp = 0;
535 if (c == 0)
536 SYNTAX("non-terminated regular expression %.10s...", buf);
537 yylval.s = tostring(buf);
538 unput('/');
539 RET(REGEXPR);
540}
541
542/* low-level lexical stuff, sort of inherited from lex */
543
/* Ring of the characters most recently returned by input(). */
char	ebuf[300];
char	*ep = ebuf;		/* next slot to fill; input() wraps it */

/* Pushback stack: unput() pushes, input() pops before reading fresh input. */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the top of the stack */

FILE	*yyin = 0;
549
550int input(void) /* get next lexical input character */
551{
552 int c;
553 extern char *lexprog;
554
555 if (yysptr > yysbuf)
556 c = (uschar)*--yysptr;
557 else if (lexprog != NULL) { /* awk '...' */
558 if ((c = (uschar)*lexprog) != 0)
559 lexprog++;
560 } else /* awk -f ... */
561 c = pgetc();
562 if (c == '\n')
563 lineno++;
564 else if (c == EOF)
565 c = 0;
566 if (ep >= ebuf + sizeof ebuf)
567 ep = ebuf;
568 return *ep++ = c;
569}
570
571void unput(int c) /* put lexical character back on input */
572{
573 if (c == '\n')
574 lineno--;
575 if (yysptr >= yysbuf + sizeof(yysbuf))
576 FATAL("pushed back too much: %.20s...", yysbuf);
577 *yysptr++ = c;
578 if (--ep < ebuf)
579 ep = ebuf + sizeof(ebuf) - 1;
580}
581
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are pushed last-to-first so that input() hands them back in
 * their original order.
 */
void unputstr(const char *s)
{
	const char *tail = s + strlen(s);

	while (tail > s)
		unput(*--tail);
}
587 unput(s[i]);
588}