contrib/gcc-4.4/libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009
   3    Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
   27 /* Spelling category stored, per token type, in the `category' member
         of struct token_spelling.  */
   28 enum spell_type
   29 {
   30   SPELL_OPERATOR = 0,   /* Category given to every OP() table entry.  */
   31   SPELL_IDENT,          /* Selected by TK() entries via SPELL_ ## s.  */
   32   SPELL_LITERAL,        /* Selected by TK() entries via SPELL_ ## s.  */
   33   SPELL_NONE            /* Selected by TK() entries via SPELL_ ## s.  */
   34 };
   35 /* One row of the token_spellings table, indexed by token type.  */
   36 struct token_spelling
   37 {
   38   enum spell_type category;   /* How this token type is classified.  */
   39   const unsigned char *name;  /* OP(): the string S; TK(): the
                                       stringified enumerator name.  */
   40 };
   41 /* Spellings of the digraph tokens.  NOTE(review): the order presumably
         has to match the digraph numbering used where this table is
         indexed, which is outside this part of the file -- confirm.  */
   42 static const unsigned char *const digraph_spellings[] =
   43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
   44 /* OP and TK turn each TTYPE_TABLE entry into one token_spelling
         initializer: OP entries get SPELL_OPERATOR and the string S, TK
         entries get SPELL_<S> and the enumerator name E as a string.  Both
         macros are undefined again once the table has been built.  */
   45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
   46 #define TK(e, s) { SPELL_ ## s,    UC #e },
   47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
   48 #undef OP
   49 #undef TK
   50 /* Accessors: look up the spelling category / name for TOKEN's type.  */
   51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
   52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
   53 /* Prototypes for file-local helpers.  */
   54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
   55 static int skip_line_comment (cpp_reader *);
   56 static void skip_whitespace (cpp_reader *, cppchar_t);
   57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
   58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
   59 static void store_comment (cpp_reader *, cpp_token *);
   60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
   61                             unsigned int, enum cpp_ttype);
   62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
   63 static int name_p (cpp_reader *, const cpp_string *);
   64 static tokenrun *next_tokenrun (tokenrun *);
   65 /* NOTE(review): new_buff is not defined in the part of the file
         reviewed here; presumably it appears further down -- confirm.  */
   66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
   99 /* Returns with a logical line that contains no escaped newlines or
  100    trigraphs.  This is a time-critical inner loop.
         The line begins at buffer->next_line on entry; buffer->cur and
         buffer->line_base are set to that position and the note array is
         emptied.  Escaped newlines are spliced out in place and, when the
         trigraphs option is on, trigraphs are replaced in place; a line
         note is recorded for each splice and for each trigraph seen
         (converted or not).  On return the end of the cleaned line has
         been overwritten with '\n', a sentinel note follows it, and
         buffer->next_line points just past the raw text consumed.  */
  101 void
  102 _cpp_clean_line (cpp_reader *pfile)
  103 {
  104   cpp_buffer *buffer;
  105   const uchar *s;   /* Read position in the raw text.  */
  106   uchar c, *d, *p;  /* D: write position; P: backwards scan pointer.  */
  107
  108   buffer = pfile->buffer;
  109   buffer->cur_note = buffer->notes_used = 0;
  110   buffer->cur = buffer->line_base = buffer->next_line;
  111   buffer->need_line = false;
  112   s = buffer->next_line - 1;
  113
  114   if (!buffer->from_stage3)
  115     {
  116       const uchar *pbackslash = NULL;  /* Last backslash seen so far.  */
  117
  118       /* Short circuit for the common case of an un-escaped line with
  119          no trigraphs.  The primary win here is by not writing any
  120          data back to memory until we have to.  */
  121       for (;;)
  122         {
  123           c = *++s;
  124           if (__builtin_expect (c == '\n', false)
  125               || __builtin_expect (c == '\r', false))
  126             {
  127               d = (uchar *) s;
  128
  129               if (__builtin_expect (s == buffer->rlimit, false))
  130                 goto done;
  131
  132               /* DOS line ending? */
  133               if (__builtin_expect (c == '\r', false)
  134                   && s[1] == '\n')
  135                 {
  136                   s++;
  137                   if (s == buffer->rlimit)
  138                     goto done;
  139                 }
  140
  141               if (__builtin_expect (pbackslash == NULL, true))
  142                 goto done;
  143
  144               /* Check for escaped newline.  */
  145               p = d;
  146               while (is_nvspace (p[-1]))
  147                 p--;
  148               if (p - 1 != pbackslash)
  149                 goto done;
  150
  151               /* Have an escaped newline; process it and proceed to
  152                  the slow path.  The note type is ' ' when space
                       separated the backslash from the newline, else '\\'.  */
  153               add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
  154               d = p - 2;
  155               buffer->next_line = p - 1;
  156               break;
  157             }
  158           if (__builtin_expect (c == '\\', false))
  159             pbackslash = s;
  160           else if (__builtin_expect (c == '?', false)
  161                    && __builtin_expect (s[1] == '?', false)
  162                    && _cpp_trigraph_map[s[2]])
  163             {
  164               /* Have a trigraph.  We may or may not have to convert
  165                  it.  Add a line note regardless, for -Wtrigraphs.  */
  166               add_line_note (buffer, s, s[2]);
  167               if (CPP_OPTION (pfile, trigraphs))
  168                 {
  169                   /* We do, and that means we have to switch to the
  170                      slow path.  */
  171                   d = (uchar *) s;
  172                   *d = _cpp_trigraph_map[s[2]];
  173                   s += 2;
  174                   break;
  175                 }
  176             }
  177         }
  178
  179       /* Slow path: copy each byte from S down to D, splicing out further
               escaped newlines and converting trigraphs as they are met.  */
  180       for (;;)
  181         {
  182           c = *++s;
  183           *++d = c;
  184
  185           if (c == '\n' || c == '\r')
  186             {
  187                   /* Handle DOS line endings.  */
  188               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
  189                 s++;
  190               if (s == buffer->rlimit)
  191                 break;
  192
  193               /* Escaped?  */
  194               p = d;
  195               while (p != buffer->next_line && is_nvspace (p[-1]))
  196                 p--;
  197               if (p == buffer->next_line || p[-1] != '\\')
  198                 break;
  199
  200               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
  201               d = p - 2;
  202               buffer->next_line = p - 1;
  203             }
  204           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
  205             {
  206               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
  207               add_line_note (buffer, d, s[2]);
  208               if (CPP_OPTION (pfile, trigraphs))
  209                 {
  210                   *d = _cpp_trigraph_map[s[2]];
  211                   s += 2;
  212                 }
  213             }
  214         }
  215     }
  216   else
  217     {
            /* from_stage3 is set: no escaped-newline or trigraph handling
               is done here; just locate the end of the line.  */
  218       do
  219         s++;
  220       while (*s != '\n' && *s != '\r');
  221       d = (uchar *) s;
  222
  223       /* Handle DOS line endings.  */
  224       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
  225         s++;
  226     }
  227
  228  done:
  229   *d = '\n';
  230   /* A sentinel note that should never be processed.  */
  231   add_line_note (buffer, d + 1, '\n');
  232   buffer->next_line = s + 1;
  233 }
 234
 235 /* Return true if the trigraph indicated by NOTE should be warned
 236    about in a comment.  */
 237 static bool
 238 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 239 {
 240   const uchar *p;
 241
 242   /* Within comments we don't warn about trigraphs, unless the
 243      trigraph forms an escaped newline, as that may change
 244      behavior.  */
 245   if (note->type != '/')
 246     return false;
 247
 248   /* If -trigraphs, then this was an escaped newline iff the next note
 249      is coincident.  */
 250   if (CPP_OPTION (pfile, trigraphs))
 251     return note[1].pos == note->pos;
 252
 253   /* Otherwise, see if this forms an escaped newline.  */
 254   p = note->pos + 3;
 255   while (is_nvspace (*p))
 256     p++;
 257
 258   /* There might have been escaped newlines between the trigraph and the
 259      newline we found.  Hence the position test.  */
 260   return (*p == '\n' && p < note[1].pos);
 261 }
 262
 263 /* Process the notes created by add_line_note as far as the current
 264    location.  */
 265 void
 266 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 267 {
 268   cpp_buffer *buffer = pfile->buffer;
 269
 270   for (;;)
 271     {
 272       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 273       unsigned int col;
 274
 275       if (note->pos > buffer->cur)
 276         break;
 277
 278       buffer->cur_note++;
 279       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 280
 281       if (note->type == '\\' || note->type == ' ')
 282         {
 283           if (note->type == ' ' && !in_comment)
 284             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 285                                  "backslash and newline separated by space");
 286
 287           if (buffer->next_line > buffer->rlimit)
 288             {
 289               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 290                                    "backslash-newline at end of file");
 291               /* Prevent "no newline at end of file" warning.  */
 292               buffer->next_line = buffer->rlimit;
 293             }
 294
 295           buffer->line_base = note->pos;
 296           CPP_INCREMENT_LINE (pfile, 0);
 297         }
 298       else if (_cpp_trigraph_map[note->type])
 299         {
 300           if (CPP_OPTION (pfile, warn_trigraphs)
 301               && (!in_comment || warn_in_comment (pfile, note)))
 302             {
 303               if (CPP_OPTION (pfile, trigraphs))
 304                 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 305                                      "trigraph ??%c converted to %c",
 306                                      note->type,
 307                                      (int) _cpp_trigraph_map[note->type]);
 308               else
 309                 {
 310                   cpp_error_with_line
 311                     (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 312                      "trigraph ??%c ignored, use -trigraphs to enable",
 313                      note->type);
 314                 }
 315             }
 316         }
 317       else
 318         abort ();
 319     }
 320 }
 321
 322 /* Skip a C-style block comment.  We find the end of the comment by
 323    seeing if an asterisk is before every '/' we encounter.  Returns
 324    nonzero if comment terminated by EOF, zero otherwise.
 325
 326    Buffer->cur points to the initial asterisk of the comment.  */
 327 bool
 328 _cpp_skip_block_comment (cpp_reader *pfile)
 329 {
 330   cpp_buffer *buffer = pfile->buffer;
 331   const uchar *cur = buffer->cur;
 332   uchar c;
 333
 334   cur++;
 335   if (*cur == '/')
 336     cur++;
 337
 338   for (;;)
 339     {
 340       /* People like decorating comments with '*', so check for '/'
 341          instead for efficiency.  */
 342       c = *cur++;
 343
 344       if (c == '/')
 345         {
 346           if (cur[-2] == '*')
 347             break;
 348
 349           /* Warn about potential nested comments, but not if the '/'
 350              comes immediately before the true comment delimiter.
 351              Don't bother to get it right across escaped newlines.  */
 352           if (CPP_OPTION (pfile, warn_comments)
 353               && cur[0] == '*' && cur[1] != '/')
 354             {
 355               buffer->cur = cur;
 356               cpp_error_with_line (pfile, CPP_DL_WARNING,
 357                                    pfile->line_table->highest_line, CPP_BUF_COL (buffer),
 358                                    "\"/*\" within comment");
 359             }
 360         }
 361       else if (c == '\n')
 362         {
 363           unsigned int cols;
 364           buffer->cur = cur - 1;
 365           _cpp_process_line_notes (pfile, true);
 366           if (buffer->next_line >= buffer->rlimit)
 367             return true;
 368           _cpp_clean_line (pfile);
 369
 370           cols = buffer->next_line - buffer->line_base;
 371           CPP_INCREMENT_LINE (pfile, cols);
 372
 373           cur = buffer->cur;
 374         }
 375     }
 376
 377   buffer->cur = cur;
 378   _cpp_process_line_notes (pfile, true);
 379   return false;
 380 }
 381
 382 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 383    terminating newline.  Handles escaped newlines.  Returns nonzero
 384    if a multiline comment.  */
 385 static int
 386 skip_line_comment (cpp_reader *pfile)
 387 {
 388   cpp_buffer *buffer = pfile->buffer;
 389   source_location orig_line = pfile->line_table->highest_line;
 390
 391   while (*buffer->cur != '\n')
 392     buffer->cur++;
 393
 394   _cpp_process_line_notes (pfile, true);
 395   return orig_line != pfile->line_table->highest_line;
 396 }
 397
 398 /* Skips whitespace, saving the next non-whitespace character.  */
 399 static void
 400 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 401 {
 402   cpp_buffer *buffer = pfile->buffer;
 403   bool saw_NUL = false;
 404
 405   do
 406     {
 407       /* Horizontal space always OK.  */
 408       if (c == ' ' || c == '\t')
 409         ;
 410       /* Just \f \v or \0 left.  */
 411       else if (c == '\0')
 412         saw_NUL = true;
 413       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 414         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 415                              CPP_BUF_COL (buffer),
 416                              "%s in preprocessing directive",
 417                              c == '\f' ? "form feed" : "vertical tab");
 418
 419       c = *buffer->cur++;
 420     }
 421   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 422   while (is_nvspace (c));
 423
 424   if (saw_NUL)
 425     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 426
 427   buffer->cur--;
 428 }
 429
 430 /* See if the characters of a number token are valid in a name (no
 431    '.', '+' or '-').  */
 432 static int
 433 name_p (cpp_reader *pfile, const cpp_string *string)
 434 {
 435   unsigned int i;
 436
 437   for (i = 0; i < string->len; i++)
 438     if (!is_idchar (string->text[i]))
 439       return 0;
 440
 441   return 1;
 442 }
 443
 444 /* After parsing an identifier or other sequence, produce a warning about
 445    sequences not in NFC/NFKC.  */
 446 static void
 447 warn_about_normalization (cpp_reader *pfile,
 448                           const cpp_token *token,
 449                           const struct normalize_state *s)
 450 {
 451   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 452       && !pfile->state.skipping)
 453     {
 454       /* Make sure that the token is printed using UCNs, even
 455          if we'd otherwise happily print UTF-8.  */
 456       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 457       size_t sz;
 458
 459       sz = cpp_spell_token (pfile, token, buf, false) - buf;
 460       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 461         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 462                              "`%.*s' is not in NFKC", (int) sz, buf);
 463       else
 464         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 465                              "`%.*s' is not in NFC", (int) sz, buf);
 466     }
 467 }
 468
 469 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
 470    an identifier.  FIRST is TRUE if this starts an identifier.  */
 471 static bool
 472 forms_identifier_p (cpp_reader *pfile, int first,
 473                     struct normalize_state *state)
 474 {
 475   cpp_buffer *buffer = pfile->buffer;
 476
 477   if (*buffer->cur == '$')
 478     {
 479       if (!CPP_OPTION (pfile, dollars_in_ident))
 480         return false;
 481
 482       buffer->cur++;
 483       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 484         {
 485           CPP_OPTION (pfile, warn_dollars) = 0;
 486           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 487         }
 488
 489       return true;
 490     }
 491
 492   /* Is this a syntactically valid UCN?  */
 493   if (CPP_OPTION (pfile, extended_identifiers)
 494       && *buffer->cur == '\\'
 495       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
 496     {
 497       buffer->cur += 2;
 498       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
 499                           state))
 500         return true;
 501       buffer->cur -= 2;
 502     }
 503
 504   return false;
 505 }
 506
 507 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
 508 static cpp_hashnode *
 509 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
 510 {
 511   cpp_hashnode *result;
 512   const uchar *cur;
 513   unsigned int len;
 514   unsigned int hash = HT_HASHSTEP (0, *base);
 515
 516   cur = base + 1;
 517   while (ISIDNUM (*cur))
 518     {
 519       hash = HT_HASHSTEP (hash, *cur);
 520       cur++;
 521     }
 522   len = cur - base;
 523   hash = HT_HASHFINISH (hash, len);
 524   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
 525                                               base, len, hash, HT_ALLOC));
 526
 527   /* Rarely, identifiers require diagnostics when lexed.  */
 528   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 529                         && !pfile->state.skipping, 0))
 530     {
 531       /* It is allowed to poison the same identifier twice.  */
 532       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 533         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 534                    NODE_NAME (result));
 535
 536       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 537          replacement list of a variadic macro.  */
 538       if (result == pfile->spec_nodes.n__VA_ARGS__
 539           && !pfile->state.va_args_ok)
 540         cpp_error (pfile, CPP_DL_PEDWARN,
 541                    "__VA_ARGS__ can only appear in the expansion"
 542                    " of a C99 variadic macro");
 543     }
 544
 545   return result;
 546 }
 547
 548 /* Get the cpp_hashnode of an identifier specified by NAME in
 549    the current cpp_reader object.  If none is found, NULL is returned.  */
 550 cpp_hashnode *
 551 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
 552 {
 553   cpp_hashnode *result;
 554   result = lex_identifier_intern (pfile, (uchar *) name);
 555   return result;
 556 }
 557
 558 /* Lex an identifier starting at BUFFER->CUR - 1.  */
 559 static cpp_hashnode *
 560 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
 561                 struct normalize_state *nst)
 562 {
 563   cpp_hashnode *result;
 564   const uchar *cur;
 565   unsigned int len;
 566   unsigned int hash = HT_HASHSTEP (0, *base);
 567
 568   cur = pfile->buffer->cur;
 569   if (! starts_ucn)
 570     while (ISIDNUM (*cur))
 571       {
 572         hash = HT_HASHSTEP (hash, *cur);
 573         cur++;
 574       }
 575   pfile->buffer->cur = cur;
 576   if (starts_ucn || forms_identifier_p (pfile, false, nst))
 577     {
 578       /* Slower version for identifiers containing UCNs (or $).  */
 579       do {
 580         while (ISIDNUM (*pfile->buffer->cur))
 581           {
 582             pfile->buffer->cur++;
 583             NORMALIZE_STATE_UPDATE_IDNUM (nst);
 584           }
 585       } while (forms_identifier_p (pfile, false, nst));
 586       result = _cpp_interpret_identifier (pfile, base,
 587                                           pfile->buffer->cur - base);
 588     }
 589   else
 590     {
 591       len = cur - base;
 592       hash = HT_HASHFINISH (hash, len);
 593
 594       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
 595                                                   base, len, hash, HT_ALLOC));
 596     }
 597
 598   /* Rarely, identifiers require diagnostics when lexed.  */
 599   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 600                         && !pfile->state.skipping, 0))
 601     {
 602       /* It is allowed to poison the same identifier twice.  */
 603       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 604         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 605                    NODE_NAME (result));
 606
 607       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 608          replacement list of a variadic macro.  */
 609       if (result == pfile->spec_nodes.n__VA_ARGS__
 610           && !pfile->state.va_args_ok)
 611         cpp_error (pfile, CPP_DL_PEDWARN,
 612                    "__VA_ARGS__ can only appear in the expansion"
 613                    " of a C99 variadic macro");
 614     }
 615
 616   return result;
 617 }
 618
 619 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
 620 static void
 621 lex_number (cpp_reader *pfile, cpp_string *number,
 622             struct normalize_state *nst)
 623 {
 624   const uchar *cur;
 625   const uchar *base;
 626   uchar *dest;
 627
 628   base = pfile->buffer->cur - 1;
 629   do
 630     {
 631       cur = pfile->buffer->cur;
 632
 633       /* N.B. ISIDNUM does not include $.  */
 634       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
 635         {
 636           cur++;
 637           NORMALIZE_STATE_UPDATE_IDNUM (nst);
 638         }
 639
 640       pfile->buffer->cur = cur;
 641     }
 642   while (forms_identifier_p (pfile, false, nst));
 643
 644   number->len = cur - base;
 645   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
 646   memcpy (dest, base, number->len);
 647   dest[number->len] = '\0';
 648   number->text = dest;
 649 }
 650
 651 /* Create a token of type TYPE with a literal spelling.  */
 652 static void
 653 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
 654                 unsigned int len, enum cpp_ttype type)
 655 {
 656   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
 657
 658   memcpy (dest, base, len);
 659   dest[len] = '\0';
 660   token->type = type;
 661   token->val.str.len = len;
 662   token->val.str.text = dest;
 663 }
 664
 665 /* Lexes a string, character constant, or angle-bracketed header file
 666    name.  The stored string contains the spelling, including opening
 667    quote and leading any leading 'L', 'u' or 'U'.  It returns the type
 668    of the literal, or CPP_OTHER if it was not properly terminated, or
 669    CPP_LESS for an unterminated header name which must be relexed as
 670    normal tokens.
 671
 672    The spelling is NUL-terminated, but it is not guaranteed that this
 673    is the first NUL since embedded NULs are preserved.  */
 674 static void
 675 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 676 {
 677   bool saw_NUL = false;
 678   const uchar *cur;
 679   cppchar_t terminator;
 680   enum cpp_ttype type;
 681
 682   cur = base;
 683   terminator = *cur++;
 684   if (terminator == 'L' || terminator == 'u' || terminator == 'U')
 685     terminator = *cur++;
 686   if (terminator == '\"')
 687     type = (*base == 'L' ? CPP_WSTRING :
 688             *base == 'U' ? CPP_STRING32 :
 689             *base == 'u' ? CPP_STRING16 : CPP_STRING);
 690   else if (terminator == '\'')
 691     type = (*base == 'L' ? CPP_WCHAR :
 692             *base == 'U' ? CPP_CHAR32 :
 693             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
 694   else
 695     terminator = '>', type = CPP_HEADER_NAME;
 696
 697   for (;;)
 698     {
 699       cppchar_t c = *cur++;
 700
 701       /* In #include-style directives, terminators are not escapable.  */
 702       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
 703         cur++;
 704       else if (c == terminator)
 705         break;
 706       else if (c == '\n')
 707         {
 708           cur--;
 709           /* Unmatched quotes always yield undefined behavior, but
 710              greedy lexing means that what appears to be an unterminated
 711              header name may actually be a legitimate sequence of tokens.  */
 712           if (terminator == '>')
 713             {
 714               token->type = CPP_LESS;
 715               return;
 716             }
 717           type = CPP_OTHER;
 718           break;
 719         }
 720       else if (c == '\0')
 721         saw_NUL = true;
 722     }
 723
 724   if (saw_NUL && !pfile->state.skipping)
 725     cpp_error (pfile, CPP_DL_WARNING,
 726                "null character(s) preserved in literal");
 727
 728   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
 729     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
 730                (int) terminator);
 731
 732   pfile->buffer->cur = cur;
 733   create_literal (pfile, token, base, cur - base, type);
 734 }
 735
 736 /* Return the comment table. The client may not make any assumption
 737    about the ordering of the table.  */
 738 cpp_comment_table *
 739 cpp_get_comments (cpp_reader *pfile)
 740 {
 741   return &pfile->comments;
 742 }
 743
 744 /* Append a comment to the end of the comment table. */
 745 static void
 746 store_comment (cpp_reader *pfile, cpp_token *token)
 747 {
 748   int len;
 749
 750   if (pfile->comments.allocated == 0)
 751     {
 752       pfile->comments.allocated = 256;
 753       pfile->comments.entries = (cpp_comment *) xmalloc
 754         (pfile->comments.allocated * sizeof (cpp_comment));
 755     }
 756
 757   if (pfile->comments.count == pfile->comments.allocated)
 758     {
 759       pfile->comments.allocated *= 2;
 760       pfile->comments.entries = (cpp_comment *) xrealloc
 761         (pfile->comments.entries,
 762          pfile->comments.allocated * sizeof (cpp_comment));
 763     }
 764
 765   len = token->val.str.len;
 766
 767   /* Copy comment. Note, token may not be NULL terminated. */
 768   pfile->comments.entries[pfile->comments.count].comment =
 769     (char *) xmalloc (sizeof (char) * (len + 1));
 770   memcpy (pfile->comments.entries[pfile->comments.count].comment,
 771           token->val.str.text, len);
 772   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
 773
 774   /* Set source location. */
 775   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
 776
 777   /* Increment the count of entries in the comment table. */
 778   pfile->comments.count++;
 779 }
 780
 781 /* The stored comment includes the comment start and any terminator.  */
 782 static void
 783 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
 784               cppchar_t type)
 785 {
 786   unsigned char *buffer;
 787   unsigned int len, clen;
 788
 789   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
 790
 791   /* C++ comments probably (not definitely) have moved past a new
 792      line, which we don't want to save in the comment.  */
 793   if (is_vspace (pfile->buffer->cur[-1]))
 794     len--;
 795
 796   /* If we are currently in a directive, then we need to store all
 797      C++ comments as C comments internally, and so we need to
 798      allocate a little extra space in that case.
 799
 800      Note that the only time we encounter a directive here is
 801      when we are saving comments in a "#define".  */
 802   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
 803
 804   buffer = _cpp_unaligned_alloc (pfile, clen);
 805
 806   token->type = CPP_COMMENT;
 807   token->val.str.len = clen;
 808   token->val.str.text = buffer;
 809
 810   buffer[0] = '/';
 811   memcpy (buffer + 1, from, len - 1);
 812
 813   /* Finish conversion to a C comment, if necessary.  */
 814   if (pfile->state.in_directive && type == '/')
 815     {
 816       buffer[1] = '*';
 817       buffer[clen - 2] = '*';
 818       buffer[clen - 1] = '/';
 819     }
 820
 821   /* Finally store this comment for use by clients of libcpp. */
 822   store_comment (pfile, token);
 823 }
 824
 825 /* Allocate COUNT tokens for RUN.  */
 826 void
 827 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
 828 {
 829   run->base = XNEWVEC (cpp_token, count);
 830   run->limit = run->base + count;
 831   run->next = NULL;
 832 }
 833
 834 /* Returns the next tokenrun, or creates one if there is none.  */
 835 static tokenrun *
 836 next_tokenrun (tokenrun *run)
 837 {
 838   if (run->next == NULL)
 839     {
 840       run->next = XNEW (tokenrun);
 841       run->next->prev = run;
 842       _cpp_init_tokenrun (run->next, 250);
 843     }
 844
 845   return run->next;
 846 }
 847
 848 /* Look ahead in the input stream.  */
 849 const cpp_token *
 850 cpp_peek_token (cpp_reader *pfile, int index)
 851 {
 852   cpp_context *context = pfile->context;
 853   const cpp_token *peektok;
 854   int count;
 855
 856   /* First, scan through any pending cpp_context objects.  */
 857   while (context->prev)
 858     {
 859       ptrdiff_t sz = (context->direct_p
 860                       ? LAST (context).token - FIRST (context).token
 861                       : LAST (context).ptoken - FIRST (context).ptoken);
 862
 863       if (index < (int) sz)
 864         return (context->direct_p
 865                 ? FIRST (context).token + index
 866                 : *(FIRST (context).ptoken + index));
 867
 868       index -= (int) sz;
 869       context = context->prev;
 870     }
 871
 872   /* We will have to read some new tokens after all (and do so
 873      without invalidating preceding tokens).  */
 874   count = index;
 875   pfile->keep_tokens++;
 876
 877   do
 878     {
 879       peektok = _cpp_lex_token (pfile);
 880       if (peektok->type == CPP_EOF)
 881         return peektok;
 882     }
 883   while (index--);
 884
 885   _cpp_backup_tokens_direct (pfile, count + 1);
 886   pfile->keep_tokens--;
 887
 888   return peektok;
 889 }
 890
  891 /* Allocate a single token that is invalidated at the same time as the
  892    rest of the tokens on the line.  Has its line and col set to the
  893    same as the last lexed token, so that diagnostics appear in the
  894    right place.
         The slot handed out is the one at pfile->cur_token, which is then
         advanced.  Tokens already stored from pfile->cur_token onwards as
         lookaheads are first moved up by one slot, spilling into the next
         token run when the current one is too short, so that the new
         token does not overwrite them.  */
  895 cpp_token *
  896 _cpp_temp_token (cpp_reader *pfile)
  897 {
  898   cpp_token *old, *result;
        /* SZ: slots between cur_token and the end of the current run.
           LA: number of pending lookahead tokens.  */
  899   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
  900   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
  901
        /* The token before cur_token supplies the source location.  */
  902   old = pfile->cur_token - 1;
  903   /* Any pre-existing lookaheads must not be clobbered.  */
  904   if (la)
  905     {
            /* The lookaheads reach the end of this run (or beyond), so a
               shift by one pushes a token into the next run.  */
  906       if (sz <= la)
  907         {
  908           tokenrun *next = next_tokenrun (pfile->cur_run);
  909
                /* Lookaheads already living in the next run move up first
                   to free its first slot.  */
  910           if (sz < la)
  911             memmove (next->base + 1, next->base,
  912                      (la - sz) * sizeof (cpp_token));
  913
                /* Carry the last token of the current run over.  */
  914           next->base[0] = pfile->cur_run->limit[-1];
  915         }
  916
            /* Shift the lookaheads that remain inside the current run.  */
  917       if (sz > 1)
  918         memmove (pfile->cur_token + 1, pfile->cur_token,
  919                  MIN (la, sz - 1) * sizeof (cpp_token));
  920     }
  921
        /* No room left in the current run: continue in the next one.  */
  922   if (!sz && pfile->cur_token == pfile->cur_run->limit)
  923     {
  924       pfile->cur_run = next_tokenrun (pfile->cur_run);
  925       pfile->cur_token = pfile->cur_run->base;
  926     }
  927
  928   result = pfile->cur_token++;
  929   result->src_loc = old->src_loc;
  930   return result;
  931 }
 932
 933 /* Lex a token into RESULT (external interface).  Takes care of issues
 934    like directive handling, token lookahead, multiple include
 935    optimization and skipping.  */
 936 const cpp_token *
 937 _cpp_lex_token (cpp_reader *pfile)
 938 {
 939   cpp_token *result;
 940
 941   for (;;)
 942     {
 943       if (pfile->cur_token == pfile->cur_run->limit)
 944         {
 945           pfile->cur_run = next_tokenrun (pfile->cur_run);
 946           pfile->cur_token = pfile->cur_run->base;
 947         }
 948       /* We assume that the current token is somewhere in the current
 949          run.  */
 950       if (pfile->cur_token < pfile->cur_run->base
 951           || pfile->cur_token >= pfile->cur_run->limit)
 952         abort ();
 953
 954       if (pfile->lookaheads)
 955         {
 956           pfile->lookaheads--;
 957           result = pfile->cur_token++;
 958         }
 959       else
 960         result = _cpp_lex_direct (pfile);
 961
 962       if (result->flags & BOL)
 963         {
 964           /* Is this a directive.  If _cpp_handle_directive returns
 965              false, it is an assembler #.  */
 966           if (result->type == CPP_HASH
 967               /* 6.10.3 p 11: Directives in a list of macro arguments
 968                  gives undefined behavior.  This implementation
 969                  handles the directive as normal.  */
 970               && pfile->state.parsing_args != 1)
 971             {
 972               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
 973                 {
 974                   if (pfile->directive_result.type == CPP_PADDING)
 975                     continue;
 976                   result = &pfile->directive_result;
 977                 }
 978             }
 979           else if (pfile->state.in_deferred_pragma)
 980             result = &pfile->directive_result;
 981
 982           if (pfile->cb.line_change && !pfile->state.skipping)
 983             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
 984         }
 985
 986       /* We don't skip tokens in directives.  */
 987       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
 988         break;
 989
 990       /* Outside a directive, invalidate controlling macros.  At file
 991          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
 992          get here and MI optimization works.  */
 993       pfile->mi_valid = false;
 994
 995       if (!pfile->state.skipping || result->type == CPP_EOF)
 996         break;
 997     }
 998
 999   return result;
1000 }
1001
1002 /* Returns true if a fresh line has been loaded.  */
1003 bool
1004 _cpp_get_fresh_line (cpp_reader *pfile)
1005 {
1006   int return_at_eof;
1007
1008   /* We can't get a new line until we leave the current directive.  */
1009   if (pfile->state.in_directive)
1010     return false;
1011
1012   for (;;)
1013     {
1014       cpp_buffer *buffer = pfile->buffer;
1015
1016       if (!buffer->need_line)
1017         return true;
1018
1019       if (buffer->next_line < buffer->rlimit)
1020         {
1021           _cpp_clean_line (pfile);
1022           return true;
1023         }
1024
1025       /* First, get out of parsing arguments state.  */
1026       if (pfile->state.parsing_args)
1027         return false;
1028
1029       /* End of buffer.  Non-empty files should end in a newline.  */
1030       if (buffer->buf != buffer->rlimit
1031           && buffer->next_line > buffer->rlimit
1032           && !buffer->from_stage3)
1033         {
1034           /* Clip to buffer size.  */
1035           buffer->next_line = buffer->rlimit;
1036         }
1037
1038       return_at_eof = buffer->return_at_eof;
1039       _cpp_pop_buffer (pfile);
1040       if (pfile->buffer == NULL || return_at_eof)
1041         return false;
1042     }
1043 }
1044
/* Helper for the lexer's operator cases.  Expects `buffer' and
   `result' to be in scope at the point of use.  Sets result->type to
   ELSE_TYPE; if the next input character equals CHAR, consumes it and
   sets result->type to THEN_TYPE instead.  All arguments are
   parenthesized so that compound expressions expand as intended.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
    }                                                   \
  while (0)
1053
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns the location of the lexed token.

   Three labels drive the control flow:
     fresh_line         - reset the flags and, if the buffer needs one,
                          fetch a new line;
     update_tokens_line - re-read the line part of the token's
                          location;
     skipped_white      - continue with the next character after
                          whitespace has been skipped.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Claim the next token slot; everything below fills it in.  */
  cpp_token *result = pfile->cur_token++;

 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The line holding a deferred pragma is finished: emit
         CPP_PRAGMA_EOL, leave deferred-pragma mode, and decrement
         prevent_expansion unless pragma_allow_expansion is set.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      /* No further line could be obtained: report CPP_EOF.  */
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless earlier tokens have to be kept, start again at the
         first slot of the base token run.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      /* This is the first token lexed from the new line.  */
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* _cpp_get_fresh_line may have changed the current buffer.  */
  buffer = pfile->buffer;
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

 skipped_white:
  /* Process line notes once the read position has reached the next
     one (not done while a buffer is overlaid), then refresh the
     location.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  /* Fetch the first character of the token; buffer->cur now points
     just past it.  */
  c = *buffer->cur++;

  LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
                               CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
    /* Horizontal whitespace (and NUL): remember it in the token's
       flags, skip it, and look at the next character.  */
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    /* End of line: the line counter is advanced only if more input
       follows in the buffer; then start over on a fresh line.  */
    case '\n':
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
      /* 'L', 'u' or 'U' may introduce wide characters or strings.  */
      if (c == 'L' || CPP_OPTION (pfile, uliterals))
        {
          if (*buffer->cur == '\'' || *buffer->cur == '"')
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Fall through.  */

    /* Identifier start characters.  'L', 'u' and 'U' are absent from
       this list because they have their own case labels above.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
                                           &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches reach this point.  A comment
         that is not being saved counts as whitespace before the next
         token, which is lexed into the same slot.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* With angled_headers set, first try lex_string from the '<'.
         If it leaves the token type as CPP_LESS, continue below as for
         an ordinary '<'.  NOTE(review): presumably that means no
         header-name was found - confirm against lex_string.  */
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "<:" and "<%" spell '[' and '{'.  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "%:" and "%:%:" spell '#' and '##'; "%>" spells
             '}'.  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      /* A '.' followed by a digit starts a number.  */
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          /* Digraph ":>" spells ']'.  */
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    /* Operators whose only two-character form appends one fixed
       character.  */
    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;

    /* Single-character punctuators.  */
    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Step back onto the character so that forms_identifier_p can
           examine it.  If no identifier starts here, step forward
           again and treat the character as a stray one below.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        buffer->cur++;
      }
      /* Fall through.  */

    /* Any other character becomes a one-character CPP_OTHER token.  */
    default:
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
1418
1419 /* An upper bound on the number of bytes needed to spell TOKEN.
1420    Does not include preceding whitespace.  */
1421 unsigned int
1422 cpp_token_len (const cpp_token *token)
1423 {
1424   unsigned int len;
1425
1426   switch (TOKEN_SPELL (token))
1427     {
1428     default:            len = 6;                                break;
1429     case SPELL_LITERAL: len = token->val.str.len;               break;
1430     case SPELL_IDENT:   len = NODE_LEN (token->val.node) * 10;  break;
1431     }
1432
1433   return len;
1434 }
1435
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \U escape (backslash, 'U', eight lower-case hex digits) in BUFFER.
   Exactly 10 bytes are written to BUFFER; no terminating NUL is added.
   Returns the number of bytes of NAME that were consumed.  Aborts if a
   continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *src = name;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int k;

  /* The number of leading 1 bits in the first byte is the length of
     the whole sequence.  */
  for (lead = *src; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* Payload bits of the lead byte, then six more per continuation.  */
  code_point = *src & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      src++;
      code_point = (code_point << 6) | (*src & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*src & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Most significant nibble first.  */
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
1469
1470
1471 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1472    already contain the enough space to hold the token's spelling.
1473    Returns a pointer to the character after the last character written.
1474    FORSTRING is true if this is to be the spelling after translation
1475    phase 1 (this is different for UCNs).
1476    FIXME: Would be nice if we didn't need the PFILE argument.  */
1477 unsigned char *
1478 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1479                  unsigned char *buffer, bool forstring)
1480 {
1481   switch (TOKEN_SPELL (token))
1482     {
1483     case SPELL_OPERATOR:
1484       {
1485         const unsigned char *spelling;
1486         unsigned char c;
1487
1488         if (token->flags & DIGRAPH)
1489           spelling
1490             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1491         else if (token->flags & NAMED_OP)
1492           goto spell_ident;
1493         else
1494           spelling = TOKEN_NAME (token);
1495
1496         while ((c = *spelling++) != '\0')
1497           *buffer++ = c;
1498       }
1499       break;
1500
1501     spell_ident:
1502     case SPELL_IDENT:
1503       if (forstring)
1504         {
1505           memcpy (buffer, NODE_NAME (token->val.node),
1506                   NODE_LEN (token->val.node));
1507           buffer += NODE_LEN (token->val.node);
1508         }
1509       else
1510         {
1511           size_t i;
1512           const unsigned char * name = NODE_NAME (token->val.node);
1513
1514           for (i = 0; i < NODE_LEN (token->val.node); i++)
1515             if (name[i] & ~0x7F)
1516               {
1517                 i += utf8_to_ucn (buffer, name + i) - 1;
1518                 buffer += 10;
1519               }
1520             else
1521               *buffer++ = NODE_NAME (token->val.node)[i];
1522         }
1523       break;
1524
1525     case SPELL_LITERAL:
1526       memcpy (buffer, token->val.str.text, token->val.str.len);
1527       buffer += token->val.str.len;
1528       break;
1529
1530     case SPELL_NONE:
1531       cpp_error (pfile, CPP_DL_ICE,
1532                  "unspellable token %s", TOKEN_NAME (token));
1533       break;
1534     }
1535
1536   return buffer;
1537 }
1538
1539 /* Returns TOKEN spelt as a null-terminated string.  The string is
1540    freed when the reader is destroyed.  Useful for diagnostics.  */
1541 unsigned char *
1542 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1543 {
1544   unsigned int len = cpp_token_len (token) + 1;
1545   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1546
1547   end = cpp_spell_token (pfile, token, start, false);
1548   end[0] = '\0';
1549
1550   return start;
1551 }
1552
1553 /* Used by C front ends, which really should move to using
1554    cpp_token_as_text.  */
1555 const char *
1556 cpp_type2name (enum cpp_ttype type)
1557 {
1558   return (const char *) token_spellings[type].name;
1559 }
1560
1561 /* Writes the spelling of token to FP, without any preceding space.
1562    Separated from cpp_spell_token for efficiency - to avoid stdio
1563    double-buffering.  */
1564 void
1565 cpp_output_token (const cpp_token *token, FILE *fp)
1566 {
1567   switch (TOKEN_SPELL (token))
1568     {
1569     case SPELL_OPERATOR:
1570       {
1571         const unsigned char *spelling;
1572         int c;
1573
1574         if (token->flags & DIGRAPH)
1575           spelling
1576             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1577         else if (token->flags & NAMED_OP)
1578           goto spell_ident;
1579         else
1580           spelling = TOKEN_NAME (token);
1581
1582         c = *spelling;
1583         do
1584           putc (c, fp);
1585         while ((c = *++spelling) != '\0');
1586       }
1587       break;
1588
1589     spell_ident:
1590     case SPELL_IDENT:
1591       {
1592         size_t i;
1593         const unsigned char * name = NODE_NAME (token->val.node);
1594
1595         for (i = 0; i < NODE_LEN (token->val.node); i++)
1596           if (name[i] & ~0x7F)
1597             {
1598               unsigned char buffer[10];
1599               i += utf8_to_ucn (buffer, name + i) - 1;
1600               fwrite (buffer, 1, 10, fp);
1601             }
1602           else
1603             fputc (NODE_NAME (token->val.node)[i], fp);
1604       }
1605       break;
1606
1607     case SPELL_LITERAL:
1608       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1609       break;
1610
1611     case SPELL_NONE:
1612       /* An error, most probably.  */
1613       break;
1614     }
1615 }
1616
1617 /* Compare two tokens.  */
1618 int
1619 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1620 {
1621   if (a->type == b->type && a->flags == b->flags)
1622     switch (TOKEN_SPELL (a))
1623       {
1624       default:                  /* Keep compiler happy.  */
1625       case SPELL_OPERATOR:
1626         return 1;
1627       case SPELL_NONE:
1628         return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1629       case SPELL_IDENT:
1630         return a->val.node == b->val.node;
1631       case SPELL_LITERAL:
1632         return (a->val.str.len == b->val.str.len
1633                 && !memcmp (a->val.str.text, b->val.str.text,
1634                             a->val.str.len));
1635       }
1636
1637   return 0;
1638 }
1639
1640 /* Returns nonzero if a space should be inserted to avoid an
1641    accidental token paste for output.  For simplicity, it is
1642    conservative, and occasionally advises a space where one is not
1643    needed, e.g. "." and ".2".  */
1644 int
1645 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1646                  const cpp_token *token2)
1647 {
1648   enum cpp_ttype a = token1->type, b = token2->type;
1649   cppchar_t c;
1650
1651   if (token1->flags & NAMED_OP)
1652     a = CPP_NAME;
1653   if (token2->flags & NAMED_OP)
1654     b = CPP_NAME;
1655
1656   c = EOF;
1657   if (token2->flags & DIGRAPH)
1658     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1659   else if (token_spellings[b].category == SPELL_OPERATOR)
1660     c = token_spellings[b].name[0];
1661
1662   /* Quickly get everything that can paste with an '='.  */
1663   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1664     return 1;
1665
1666   switch (a)
1667     {
1668     case CPP_GREATER:   return c == '>';
1669     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
1670     case CPP_PLUS:      return c == '+';
1671     case CPP_MINUS:     return c == '-' || c == '>';
1672     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1673     case CPP_MOD:       return c == ':' || c == '>';
1674     case CPP_AND:       return c == '&';
1675     case CPP_OR:        return c == '|';
1676     case CPP_COLON:     return c == ':' || c == '>';
1677     case CPP_DEREF:     return c == '*';
1678     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1679     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1680     case CPP_NAME:      return ((b == CPP_NUMBER
1681                                  && name_p (pfile, &token2->val.str))
1682                                 || b == CPP_NAME
1683                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1684     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1685                                 || c == '.' || c == '+' || c == '-');
1686                                       /* UCNs */
1687     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
1688                                  && b == CPP_NAME)
1689                                 || (CPP_OPTION (pfile, objc)
1690                                     && token1->val.str.text[0] == '@'
1691                                     && (b == CPP_NAME || b == CPP_STRING)));
1692     default:            break;
1693     }
1694
1695   return 0;
1696 }
1697
1698 /* Output all the remaining tokens on the current line, and a newline
1699    character, to FP.  Leading whitespace is removed.  If there are
1700    macros, special token padding is not performed.  */
1701 void
1702 cpp_output_line (cpp_reader *pfile, FILE *fp)
1703 {
1704   const cpp_token *token;
1705
1706   token = cpp_get_token (pfile);
1707   while (token->type != CPP_EOF)
1708     {
1709       cpp_output_token (token, fp);
1710       token = cpp_get_token (pfile);
1711       if (token->flags & PREV_WHITE)
1712         putc (' ', fp);
1713     }
1714
1715   putc ('\n', fp);
1716 }
1717
1718 /* Return a string representation of all the remaining tokens on the
1719    current line.  The result is allocated using xmalloc and must be
1720    freed by the caller.  */
1721 unsigned char *
1722 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
1723 {
1724   const cpp_token *token;
1725   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
1726   unsigned int alloced = 120 + out;
1727   unsigned char *result = (unsigned char *) xmalloc (alloced);
1728
1729   /* If DIR_NAME is empty, there are no initial contents.  */
1730   if (dir_name)
1731     {
1732       sprintf ((char *) result, "#%s ", dir_name);
1733       out += 2;
1734     }
1735
1736   token = cpp_get_token (pfile);
1737   while (token->type != CPP_EOF)
1738     {
1739       unsigned char *last;
1740       /* Include room for a possible space and the terminating nul.  */
1741       unsigned int len = cpp_token_len (token) + 2;
1742
1743       if (out + len > alloced)
1744         {
1745           alloced *= 2;
1746           if (out + len > alloced)
1747             alloced = out + len;
1748           result = (unsigned char *) xrealloc (result, alloced);
1749         }
1750
1751       last = cpp_spell_token (pfile, token, &result[out], 0);
1752       out = last - result;
1753
1754       token = cpp_get_token (pfile);
1755       if (token->flags & PREV_WHITE)
1756         result[out++] = ' ';
1757     }
1758
1759   result[out] = '\0';
1760   return result;
1761 }
1762
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   still handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when BUFF
   is extended: MIN_EXTRA plus twice the bytes between BUFF's cur and
   limit.  Every macro argument is parenthesized so that compound
   expressions expand as intended.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1777
1778 /* Create a new allocation buffer.  Place the control block at the end
1779    of the buffer, so that buffer overflows will cause immediate chaos.  */
1780 static _cpp_buff *
1781 new_buff (size_t len)
1782 {
1783   _cpp_buff *result;
1784   unsigned char *base;
1785
1786   if (len < MIN_BUFF_SIZE)
1787     len = MIN_BUFF_SIZE;
1788   len = CPP_ALIGN (len);
1789
1790   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1791   result = (_cpp_buff *) (base + len);
1792   result->base = base;
1793   result->cur = base;
1794   result->limit = base + len;
1795   result->next = NULL;
1796   return result;
1797 }
1798
1799 /* Place a chain of unwanted allocation buffers on the free list.  */
1800 void
1801 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1802 {
1803   _cpp_buff *end = buff;
1804
1805   while (end->next)
1806     end = end->next;
1807   end->next = pfile->free_buffs;
1808   pfile->free_buffs = buff;
1809 }
1810
1811 /* Return a free buffer of size at least MIN_SIZE.  */
1812 _cpp_buff *
1813 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1814 {
1815   _cpp_buff *result, **p;
1816
1817   for (p = &pfile->free_buffs;; p = &(*p)->next)
1818     {
1819       size_t size;
1820
1821       if (*p == NULL)
1822         return new_buff (min_size);
1823       result = *p;
1824       size = result->limit - result->base;
1825       /* Return a buffer that's big enough, but don't waste one that's
1826          way too big.  */
1827       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1828         break;
1829     }
1830
1831   *p = result->next;
1832   result->next = NULL;
1833   result->cur = result->base;
1834   return result;
1835 }
1836
1837 /* Creates a new buffer with enough space to hold the uncommitted
1838    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1839    the excess bytes to the new buffer.  Chains the new buffer after
1840    BUFF, and returns the new buffer.  */
1841 _cpp_buff *
1842 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1843 {
1844   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1845   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1846
1847   buff->next = new_buff;
1848   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1849   return new_buff;
1850 }
1851
1852 /* Creates a new buffer with enough space to hold the uncommitted
1853    remaining bytes of the buffer pointed to by BUFF, and at least
1854    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1855    Chains the new buffer before the buffer pointed to by BUFF, and
1856    updates the pointer to point to the new buffer.  */
1857 void
1858 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1859 {
1860   _cpp_buff *new_buff, *old_buff = *pbuff;
1861   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1862
1863   new_buff = _cpp_get_buff (pfile, size);
1864   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1865   new_buff->next = old_buff;
1866   *pbuff = new_buff;
1867 }
1868
1869 /* Free a chain of buffers starting at BUFF.  */
1870 void
1871 _cpp_free_buff (_cpp_buff *buff)
1872 {
1873   _cpp_buff *next;
1874
1875   for (; buff; buff = next)
1876     {
1877       next = buff->next;
1878       free (buff->base);
1879     }
1880 }
1881
1882 /* Allocate permanent, unaligned storage of length LEN.  */
1883 unsigned char *
1884 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1885 {
1886   _cpp_buff *buff = pfile->u_buff;
1887   unsigned char *result = buff->cur;
1888
1889   if (len > (size_t) (buff->limit - result))
1890     {
1891       buff = _cpp_get_buff (pfile, len);
1892       buff->next = pfile->u_buff;
1893       pfile->u_buff = buff;
1894       result = buff->cur;
1895     }
1896
1897   buff->cur = result + len;
1898   return result;
1899 }
1900
1901 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1902    That buffer is used for growing allocations when saving macro
1903    replacement lists in a #define, and when parsing an answer to an
1904    assertion in #assert, #unassert or #if (and therefore possibly
1905    whilst expanding macros).  It therefore must not be used by any
1906    code that they might call: specifically the lexer and the guts of
1907    the macro expander.
1908
1909    All existing other uses clearly fit this restriction: storing
1910    registered pragmas during initialization.  */
1911 unsigned char *
1912 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1913 {
1914   _cpp_buff *buff = pfile->a_buff;
1915   unsigned char *result = buff->cur;
1916
1917   if (len > (size_t) (buff->limit - result))
1918     {
1919       buff = _cpp_get_buff (pfile, len);
1920       buff->next = pfile->a_buff;
1921       pfile->a_buff = buff;
1922       result = buff->cur;
1923     }
1924
1925   buff->cur = result + len;
1926   return result;
1927 }
1928
1929 /* Say which field of TOK is in use.  */
1930
1931 enum cpp_token_fld_kind
1932 cpp_token_val_index (cpp_token *tok)
1933 {
1934   switch (TOKEN_SPELL (tok))
1935     {
1936     case SPELL_IDENT:
1937       return CPP_TOKEN_FLD_NODE;
1938     case SPELL_LITERAL:
1939       return CPP_TOKEN_FLD_STR;
1940     case SPELL_NONE:
1941       if (tok->type == CPP_MACRO_ARG)
1942         return CPP_TOKEN_FLD_ARG_NO;
1943       else if (tok->type == CPP_PADDING)
1944         return CPP_TOKEN_FLD_SOURCE;
1945       else if (tok->type == CPP_PRAGMA)
1946         return CPP_TOKEN_FLD_PRAGMA;
1947       /* else fall through */
1948     default:
1949       return CPP_TOKEN_FLD_NONE;
1950     }
1951 }