contrib/grep/lib/regcomp.c

   1 /* -*- buffer-read-only: t -*- vi: set ro: */
   2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
   3 /* Extended regular expression matching and search library.
   4    Copyright (C) 2002-2011 Free Software Foundation, Inc.
   5    This file is part of the GNU C Library.
   6    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   7
   8    This program is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 3, or (at your option)
  11    any later version.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License along
  19    with this program; if not, write to the Free Software Foundation,
  20    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  21
  22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  23                                           size_t length, reg_syntax_t syntax);
  24 static void re_compile_fastmap_iter (regex_t *bufp,
  25                                      const re_dfastate_t *init_state,
  26                                      char *fastmap);
  27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t preorder (bin_tree_t *root,
  38                                reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                void *extra);
  40 static reg_errcode_t postorder (bin_tree_t *root,
  41                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  42                                 void *extra);
  43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  46                                  bin_tree_t *node);
  47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  50 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  51 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  52                                    unsigned int constraint);
  53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  55                                          Idx node, bool root);
  56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  57 static Idx fetch_number (re_string_t *input, re_token_t *token,
  58                          reg_syntax_t syntax);
  59 static int peek_token (re_token_t *token, re_string_t *input,
  60                         reg_syntax_t syntax) internal_function;
  61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  62                           reg_syntax_t syntax, reg_errcode_t *err);
  63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  64                                   re_token_t *token, reg_syntax_t syntax,
  65                                   Idx nest, reg_errcode_t *err);
  66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  67                                  re_token_t *token, reg_syntax_t syntax,
  68                                  Idx nest, reg_errcode_t *err);
  69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  70                                      re_token_t *token, reg_syntax_t syntax,
  71                                      Idx nest, reg_errcode_t *err);
  72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  73                                   re_token_t *token, reg_syntax_t syntax,
  74                                   Idx nest, reg_errcode_t *err);
  75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  76                                  re_dfa_t *dfa, re_token_t *token,
  77                                  reg_syntax_t syntax, reg_errcode_t *err);
  78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  79                                       re_token_t *token, reg_syntax_t syntax,
  80                                       reg_errcode_t *err);
  81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  82                                             re_string_t *regexp,
  83                                             re_token_t *token, int token_len,
  84                                             re_dfa_t *dfa,
  85                                             reg_syntax_t syntax,
  86                                             bool accept_hyphen);
  87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  88                                           re_string_t *regexp,
  89                                           re_token_t *token);
  90 #ifdef RE_ENABLE_I18N
  91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  92                                         re_charset_t *mbcset,
  93                                         Idx *equiv_class_alloc,
  94                                         const unsigned char *name);
  95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  96                                       bitset_t sbcset,
  97                                       re_charset_t *mbcset,
  98                                       Idx *char_class_alloc,
  99                                       const unsigned char *class_name,
 100                                       reg_syntax_t syntax);
 101 #else  /* not RE_ENABLE_I18N */
 102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 103                                         const unsigned char *name);
 104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 105                                       bitset_t sbcset,
 106                                       const unsigned char *class_name,
 107                                       reg_syntax_t syntax);
 108 #endif /* not RE_ENABLE_I18N */
 109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 110                                        RE_TRANSLATE_TYPE trans,
 111                                        const unsigned char *class_name,
 112                                        const unsigned char *extra,
 113                                        bool non_match, reg_errcode_t *err);
 114 static bin_tree_t *create_tree (re_dfa_t *dfa,
 115                                 bin_tree_t *left, bin_tree_t *right,
 116                                 re_token_type_t type);
 117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 118                                       bin_tree_t *left, bin_tree_t *right,
 119                                       const re_token_t *token);
 120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 121 static void free_token (re_token_t *node);
 122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 124 \f
 125 /* This table gives an error message for each of the error codes listed
 126    in regex.h.  Obviously the order here has to be same as there.
 127    POSIX doesn't require that we do anything for REG_NOERROR,
 128    but why not be nice?  */
 129
 130 static const char __re_error_msgid[] =
 131   {
 132 #define REG_NOERROR_IDX 0
 133     gettext_noop ("Success")    /* REG_NOERROR */
 134     "\0"
 135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 136     gettext_noop ("No match")   /* REG_NOMATCH */
 137     "\0"
 138 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 139     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 140     "\0"
 141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 142     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 143     "\0"
 144 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 145     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 146     "\0"
 147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 148     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 149     "\0"
 150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 151     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 152     "\0"
 153 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 154     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 155     "\0"
 156 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 157     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 158     "\0"
 159 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 160     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 161     "\0"
 162 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 163     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 164     "\0"
 165 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 166     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 167     "\0"
 168 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 169     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 170     "\0"
 171 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 172     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 173     "\0"
 174 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 175     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 176     "\0"
 177 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 178     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 179     "\0"
 180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 181     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 182   };
 183
 184 static const size_t __re_error_msgid_idx[] =
 185   {
 186     REG_NOERROR_IDX,
 187     REG_NOMATCH_IDX,
 188     REG_BADPAT_IDX,
 189     REG_ECOLLATE_IDX,
 190     REG_ECTYPE_IDX,
 191     REG_EESCAPE_IDX,
 192     REG_ESUBREG_IDX,
 193     REG_EBRACK_IDX,
 194     REG_EPAREN_IDX,
 195     REG_EBRACE_IDX,
 196     REG_BADBR_IDX,
 197     REG_ERANGE_IDX,
 198     REG_ESPACE_IDX,
 199     REG_BADRPT_IDX,
 200     REG_EEND_IDX,
 201     REG_ESIZE_IDX,
 202     REG_ERPAREN_IDX
 203   };
 204 \f
 205 /* Entry points for GNU code.  */
 206
 207 /* re_compile_pattern is the GNU regular expression compiler: it
 208    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 209    Returns 0 if the pattern was valid, otherwise an error string.
 210
 211    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 212    are set in BUFP on entry.  */
 213
 214 #ifdef _LIBC
 215 const char *
 216 re_compile_pattern (pattern, length, bufp)
 217     const char *pattern;
 218     size_t length;
 219     struct re_pattern_buffer *bufp;
 220 #else /* size_t might promote */
 221 const char *
 222 re_compile_pattern (const char *pattern, size_t length,
 223                     struct re_pattern_buffer *bufp)
 224 #endif
 225 {
 226   reg_errcode_t ret;
 227
 228   /* And GNU code determines whether or not to get register information
 229      by passing null for the REGS argument to re_match, etc., not by
 230      setting no_sub, unless RE_NO_SUB is set.  */
 231   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 232
 233   /* Match anchors at newline.  */
 234   bufp->newline_anchor = 1;
 235
 236   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 237
 238   if (!ret)
 239     return NULL;
 240   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 241 }
 242 #ifdef _LIBC
 243 weak_alias (__re_compile_pattern, re_compile_pattern)
 244 #endif
 245
 246 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 247    also be assigned to arbitrarily: each pattern buffer stores its own
 248    syntax, so it can be changed between regex compilations.  */
 249 /* This has no initializer because initialized variables in Emacs
 250    become read-only after dumping.  */
 251 reg_syntax_t re_syntax_options;
 252
 253
 254 /* Specify the precise syntax of regexps for compilation.  This provides
 255    for compatibility for various utilities which historically have
 256    different, incompatible syntaxes.
 257
 258    The argument SYNTAX is a bit mask comprised of the various bits
 259    defined in regex.h.  We return the old syntax.  */
 260
 261 reg_syntax_t
 262 re_set_syntax (syntax)
 263     reg_syntax_t syntax;
 264 {
 265   reg_syntax_t ret = re_syntax_options;
 266
 267   re_syntax_options = syntax;
 268   return ret;
 269 }
 270 #ifdef _LIBC
 271 weak_alias (__re_set_syntax, re_set_syntax)
 272 #endif
 273
 274 int
 275 re_compile_fastmap (bufp)
 276     struct re_pattern_buffer *bufp;
 277 {
 278   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 279   char *fastmap = bufp->fastmap;
 280
 281   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 282   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 283   if (dfa->init_state != dfa->init_state_word)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 285   if (dfa->init_state != dfa->init_state_nl)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 287   if (dfa->init_state != dfa->init_state_begbuf)
 288     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 289   bufp->fastmap_accurate = 1;
 290   return 0;
 291 }
 292 #ifdef _LIBC
 293 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 294 #endif
 295
 296 static inline void
 297 __attribute ((always_inline))
 298 re_set_fastmap (char *fastmap, bool icase, int ch)
 299 {
 300   fastmap[ch] = 1;
 301   if (icase)
 302     fastmap[tolower (ch)] = 1;
 303 }
 304
 305 /* Helper function for re_compile_fastmap.
 306    Compile fastmap for the initial_state INIT_STATE.  */
 307
 308 static void
 309 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 310                          char *fastmap)
 311 {
 312   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 313   Idx node_cnt;
 314   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 315   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 316     {
 317       Idx node = init_state->nodes.elems[node_cnt];
 318       re_token_type_t type = dfa->nodes[node].type;
 319
 320       if (type == CHARACTER)
 321         {
 322           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 323 #ifdef RE_ENABLE_I18N
 324           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 325             {
 326               unsigned char buf[MB_LEN_MAX];
 327               unsigned char *p;
 328               wchar_t wc;
 329               mbstate_t state;
 330
 331               p = buf;
 332               *p++ = dfa->nodes[node].opr.c;
 333               while (++node < dfa->nodes_len
 334                      && dfa->nodes[node].type == CHARACTER
 335                      && dfa->nodes[node].mb_partial)
 336                 *p++ = dfa->nodes[node].opr.c;
 337               memset (&state, '\0', sizeof (state));
 338               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 339                              &state) == p - buf
 340                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 341                       != (size_t) -1))
 342                 re_set_fastmap (fastmap, false, buf[0]);
 343             }
 344 #endif
 345         }
 346       else if (type == SIMPLE_BRACKET)
 347         {
 348           int i, ch;
 349           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 350             {
 351               int j;
 352               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 353               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 354                 if (w & ((bitset_word_t) 1 << j))
 355                   re_set_fastmap (fastmap, icase, ch);
 356             }
 357         }
 358 #ifdef RE_ENABLE_I18N
 359       else if (type == COMPLEX_BRACKET)
 360         {
 361           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 362           Idx i;
 363
 364 # ifdef _LIBC
 365           /* See if we have to try all bytes which start multiple collation
 366              elements.
 367              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 368                   collation element, and don't catch 'b' since 'b' is
 369                   the only collation element which starts from 'b' (and
 370                   it is caught by SIMPLE_BRACKET).  */
 371               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 372                   && (cset->ncoll_syms || cset->nranges))
 373                 {
 374                   const int32_t *table = (const int32_t *)
 375                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 376                   for (i = 0; i < SBC_MAX; ++i)
 377                     if (table[i] < 0)
 378                       re_set_fastmap (fastmap, icase, i);
 379                 }
 380 # endif /* _LIBC */
 381
 382           /* See if we have to start the match at all multibyte characters,
 383              i.e. where we would not find an invalid sequence.  This only
 384              applies to multibyte character sets; for single byte character
 385              sets, the SIMPLE_BRACKET again suffices.  */
 386           if (dfa->mb_cur_max > 1
 387               && (cset->nchar_classes || cset->non_match || cset->nranges
 388 # ifdef _LIBC
 389                   || cset->nequiv_classes
 390 # endif /* _LIBC */
 391                  ))
 392             {
 393               unsigned char c = 0;
 394               do
 395                 {
 396                   mbstate_t mbs;
 397                   memset (&mbs, 0, sizeof (mbs));
 398                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 399                     re_set_fastmap (fastmap, false, (int) c);
 400                 }
 401               while (++c != 0);
 402             }
 403
 404           else
 405             {
 406               /* ... Else catch all bytes which can start the mbchars.  */
 407               for (i = 0; i < cset->nmbchars; ++i)
 408                 {
 409                   char buf[256];
 410                   mbstate_t state;
 411                   memset (&state, '\0', sizeof (state));
 412                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 413                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 414                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 415                     {
 416                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 417                           != (size_t) -1)
 418                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 419                     }
 420                 }
 421             }
 422         }
 423 #endif /* RE_ENABLE_I18N */
 424       else if (type == OP_PERIOD
 425 #ifdef RE_ENABLE_I18N
 426                || type == OP_UTF8_PERIOD
 427 #endif /* RE_ENABLE_I18N */
 428                || type == END_OF_RE)
 429         {
 430           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 431           if (type == END_OF_RE)
 432             bufp->can_be_null = 1;
 433           return;
 434         }
 435     }
 436 }
 437 \f
 438 /* Entry point for POSIX code.  */
 439 /* regcomp takes a regular expression as a string and compiles it.
 440
 441    PREG is a regex_t *.  We do not expect any fields to be initialized,
 442    since POSIX says we shouldn't.  Thus, we set
 443
 444      `buffer' to the compiled pattern;
 445      `used' to the length of the compiled pattern;
 446      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 447        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 448        RE_SYNTAX_POSIX_BASIC;
 449      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 450      `fastmap' to an allocated space for the fastmap;
 451      `fastmap_accurate' to zero;
 452      `re_nsub' to the number of subexpressions in PATTERN.
 453
 454    PATTERN is the address of the pattern string.
 455
 456    CFLAGS is a series of bits which affect compilation.
 457
 458      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 459      use POSIX basic syntax.
 460
 461      If REG_NEWLINE is set, then . and [^...] don't match newline.
 462      Also, regexec will try a match beginning after every newline.
 463
 464      If REG_ICASE is set, then we considers upper- and lowercase
 465      versions of letters to be equivalent when matching.
 466
 467      If REG_NOSUB is set, then when PREG is passed to regexec, that
 468      routine will report only success or failure, and nothing about the
 469      registers.
 470
 471    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 472    the return codes and their meanings.)  */
 473
 474 int
 475 regcomp (preg, pattern, cflags)
 476     regex_t *_Restrict_ preg;
 477     const char *_Restrict_ pattern;
 478     int cflags;
 479 {
 480   reg_errcode_t ret;
 481   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 482                          : RE_SYNTAX_POSIX_BASIC);
 483
 484   preg->buffer = NULL;
 485   preg->allocated = 0;
 486   preg->used = 0;
 487
 488   /* Try to allocate space for the fastmap.  */
 489   preg->fastmap = re_malloc (char, SBC_MAX);
 490   if (BE (preg->fastmap == NULL, 0))
 491     return REG_ESPACE;
 492
 493   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 494
 495   /* If REG_NEWLINE is set, newlines are treated differently.  */
 496   if (cflags & REG_NEWLINE)
 497     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 498       syntax &= ~RE_DOT_NEWLINE;
 499       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 500       /* It also changes the matching behavior.  */
 501       preg->newline_anchor = 1;
 502     }
 503   else
 504     preg->newline_anchor = 0;
 505   preg->no_sub = !!(cflags & REG_NOSUB);
 506   preg->translate = NULL;
 507
 508   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 509
 510   /* POSIX doesn't distinguish between an unmatched open-group and an
 511      unmatched close-group: both are REG_EPAREN.  */
 512   if (ret == REG_ERPAREN)
 513     ret = REG_EPAREN;
 514
 515   /* We have already checked preg->fastmap != NULL.  */
 516   if (BE (ret == REG_NOERROR, 1))
 517     /* Compute the fastmap now, since regexec cannot modify the pattern
 518        buffer.  This function never fails in this implementation.  */
 519     (void) re_compile_fastmap (preg);
 520   else
 521     {
 522       /* Some error occurred while compiling the expression.  */
 523       re_free (preg->fastmap);
 524       preg->fastmap = NULL;
 525     }
 526
 527   return (int) ret;
 528 }
 529 #ifdef _LIBC
 530 weak_alias (__regcomp, regcomp)
 531 #endif
 532
 533 /* Returns a message corresponding to an error code, ERRCODE, returned
 534    from either regcomp or regexec.   We don't use PREG here.  */
 535
 536 #ifdef _LIBC
 537 size_t
 538 regerror (errcode, preg, errbuf, errbuf_size)
 539     int errcode;
 540     const regex_t *_Restrict_ preg;
 541     char *_Restrict_ errbuf;
 542     size_t errbuf_size;
 543 #else /* size_t might promote */
 544 size_t
 545 regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_,
 546           char *_Restrict_ errbuf, size_t errbuf_size)
 547 #endif
 548 {
 549   const char *msg;
 550   size_t msg_size;
 551
 552   if (BE (errcode < 0
 553           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 554                                / sizeof (__re_error_msgid_idx[0])), 0))
 555     /* Only error codes returned by the rest of the code should be passed
 556        to this routine.  If we are given anything else, or if other regex
 557        code generates an invalid error code, then the program has a bug.
 558        Dump core so we can fix it.  */
 559     abort ();
 560
 561   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 562
 563   msg_size = strlen (msg) + 1; /* Includes the null.  */
 564
 565   if (BE (errbuf_size != 0, 1))
 566     {
 567       size_t cpy_size = msg_size;
 568       if (BE (msg_size > errbuf_size, 0))
 569         {
 570           cpy_size = errbuf_size - 1;
 571           errbuf[cpy_size] = '\0';
 572         }
 573       memcpy (errbuf, msg, cpy_size);
 574     }
 575
 576   return msg_size;
 577 }
 578 #ifdef _LIBC
 579 weak_alias (__regerror, regerror)
 580 #endif
 581
 582
 583 #ifdef RE_ENABLE_I18N
 584 /* This static array is used for the map to single-byte characters when
 585    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 586    it the same all the time.  UTF-8 is the preferred encoding so this is
 587    a worthwhile optimization.  */
 588 static const bitset_t utf8_sb_map =
 589 {
 590   /* Set the first 128 bits.  */
 591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 592 #  error "bitset_word_t is narrower than 32 bits"
 593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 594   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 596   BITSET_WORD_MAX, BITSET_WORD_MAX,
 597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 598   BITSET_WORD_MAX,
 599 # endif
 600   (BITSET_WORD_MAX
 601    >> (SBC_MAX % BITSET_WORD_BITS == 0
 602        ? 0
 603        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 604 };
 605 #endif
 606
 607
 608 static void
 609 free_dfa_content (re_dfa_t *dfa)
 610 {
 611   Idx i, j;
 612
 613   if (dfa->nodes)
 614     for (i = 0; i < dfa->nodes_len; ++i)
 615       free_token (dfa->nodes + i);
 616   re_free (dfa->nexts);
 617   for (i = 0; i < dfa->nodes_len; ++i)
 618     {
 619       if (dfa->eclosures != NULL)
 620         re_node_set_free (dfa->eclosures + i);
 621       if (dfa->inveclosures != NULL)
 622         re_node_set_free (dfa->inveclosures + i);
 623       if (dfa->edests != NULL)
 624         re_node_set_free (dfa->edests + i);
 625     }
 626   re_free (dfa->edests);
 627   re_free (dfa->eclosures);
 628   re_free (dfa->inveclosures);
 629   re_free (dfa->nodes);
 630
 631   if (dfa->state_table)
 632     for (i = 0; i <= dfa->state_hash_mask; ++i)
 633       {
 634         struct re_state_table_entry *entry = dfa->state_table + i;
 635         for (j = 0; j < entry->num; ++j)
 636           {
 637             re_dfastate_t *state = entry->array[j];
 638             free_state (state);
 639           }
 640         re_free (entry->array);
 641       }
 642   re_free (dfa->state_table);
 643 #ifdef RE_ENABLE_I18N
 644   if (dfa->sb_char != utf8_sb_map)
 645     re_free (dfa->sb_char);
 646 #endif
 647   re_free (dfa->subexp_map);
 648 #ifdef DEBUG
 649   re_free (dfa->re_str);
 650 #endif
 651
 652   re_free (dfa);
 653 }
 654
 655
 656 /* Free dynamically allocated space used by PREG.  */
 657
 658 void
 659 regfree (preg)
 660     regex_t *preg;
 661 {
 662   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 663   if (BE (dfa != NULL, 1))
 664     free_dfa_content (dfa);
 665   preg->buffer = NULL;
 666   preg->allocated = 0;
 667
 668   re_free (preg->fastmap);
 669   preg->fastmap = NULL;
 670
 671   re_free (preg->translate);
 672   preg->translate = NULL;
 673 }
 674 #ifdef _LIBC
 675 weak_alias (__regfree, regfree)
 676 #endif
 677 \f
 678 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 679    them unless specifically requested.  */
 680
 681 #if defined _REGEX_RE_COMP || defined _LIBC
 682
 683 /* BSD has one and only one pattern buffer.  */
 684 static struct re_pattern_buffer re_comp_buf;
 685
 686 char *
 687 # ifdef _LIBC
 688 /* Make these definitions weak in libc, so POSIX programs can redefine
 689    these names if they don't use our functions, and still use
 690    regcomp/regexec above without link errors.  */
 691 weak_function
 692 # endif
 693 re_comp (s)
 694      const char *s;
 695 {
 696   reg_errcode_t ret;
 697   char *fastmap;
 698
 699   if (!s)
 700     {
 701       if (!re_comp_buf.buffer)
 702         return gettext ("No previous regular expression");
 703       return 0;
 704     }
 705
 706   if (re_comp_buf.buffer)
 707     {
 708       fastmap = re_comp_buf.fastmap;
 709       re_comp_buf.fastmap = NULL;
 710       __regfree (&re_comp_buf);
 711       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 712       re_comp_buf.fastmap = fastmap;
 713     }
 714
 715   if (re_comp_buf.fastmap == NULL)
 716     {
 717       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 718       if (re_comp_buf.fastmap == NULL)
 719         return (char *) gettext (__re_error_msgid
 720                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 721     }
 722
 723   /* Since `re_exec' always passes NULL for the `regs' argument, we
 724      don't need to initialize the pattern buffer fields which affect it.  */
 725
 726   /* Match anchors at newlines.  */
 727   re_comp_buf.newline_anchor = 1;
 728
 729   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 730
 731   if (!ret)
 732     return NULL;
 733
 734   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 735   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 736 }
 737
 738 #ifdef _LIBC
 739 libc_freeres_fn (free_mem)
 740 {
 741   __regfree (&re_comp_buf);
 742 }
 743 #endif
 744
 745 #endif /* _REGEX_RE_COMP */
 746 \f
 747 /* Internal entry point.
 748    Compile the regular expression PATTERN, whose length is LENGTH.
 749    SYNTAX indicate regular expression's syntax.  */
 750
 751 static reg_errcode_t
 752 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 753                      reg_syntax_t syntax)
 754 {
 755   reg_errcode_t err = REG_NOERROR;
 756   re_dfa_t *dfa;
 757   re_string_t regexp;
 758
 759   /* Initialize the pattern buffer.  */
 760   preg->fastmap_accurate = 0;
 761   preg->syntax = syntax;
 762   preg->not_bol = preg->not_eol = 0;
 763   preg->used = 0;
 764   preg->re_nsub = 0;
 765   preg->can_be_null = 0;
 766   preg->regs_allocated = REGS_UNALLOCATED;
 767
 768   /* Initialize the dfa.  */
 769   dfa = (re_dfa_t *) preg->buffer;
 770   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 771     {
 772       /* If zero allocated, but buffer is non-null, try to realloc
 773          enough space.  This loses if buffer's address is bogus, but
 774          that is the user's responsibility.  If ->buffer is NULL this
 775          is a simple allocation.  */
 776       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 777       if (dfa == NULL)
 778         return REG_ESPACE;
 779       preg->allocated = sizeof (re_dfa_t);
 780       preg->buffer = (unsigned char *) dfa;
 781     }
 782   preg->used = sizeof (re_dfa_t);
 783
 784   err = init_dfa (dfa, length);
 785   if (BE (err != REG_NOERROR, 0))
 786     {
 787       free_dfa_content (dfa);
 788       preg->buffer = NULL;
 789       preg->allocated = 0;
 790       return err;
 791     }
 792 #ifdef DEBUG
 793   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 794   dfa->re_str = re_malloc (char, length + 1);
 795   strncpy (dfa->re_str, pattern, length + 1);
 796 #endif
 797
 798   __libc_lock_init (dfa->lock);
 799
 800   err = re_string_construct (&regexp, pattern, length, preg->translate,
 801                              (syntax & RE_ICASE) != 0, dfa);
 802   if (BE (err != REG_NOERROR, 0))
 803     {
 804     re_compile_internal_free_return:
 805       free_workarea_compile (preg);
 806       re_string_destruct (&regexp);
 807       free_dfa_content (dfa);
 808       preg->buffer = NULL;
 809       preg->allocated = 0;
 810       return err;
 811     }
 812
 813   /* Parse the regular expression, and build a structure tree.  */
 814   preg->re_nsub = 0;
 815   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 816   if (BE (dfa->str_tree == NULL, 0))
 817     goto re_compile_internal_free_return;
 818
 819   /* Analyze the tree and create the nfa.  */
 820   err = analyze (preg);
 821   if (BE (err != REG_NOERROR, 0))
 822     goto re_compile_internal_free_return;
 823
 824 #ifdef RE_ENABLE_I18N
 825   /* If possible, do searching in single byte encoding to speed things up.  */
 826   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 827     optimize_utf8 (dfa);
 828 #endif
 829
 830   /* Then create the initial state of the dfa.  */
 831   err = create_initial_state (dfa);
 832
 833   /* Release work areas.  */
 834   free_workarea_compile (preg);
 835   re_string_destruct (&regexp);
 836
 837   if (BE (err != REG_NOERROR, 0))
 838     {
 839       free_dfa_content (dfa);
 840       preg->buffer = NULL;
 841       preg->allocated = 0;
 842     }
 843
 844   return err;
 845 }
 846
 847 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 848    as the initial length of some arrays.  */
 849
 850 static reg_errcode_t
 851 init_dfa (re_dfa_t *dfa, size_t pat_len)
 852 {
 853   __re_size_t table_size;
 854 #ifndef _LIBC
 855   char *codeset_name;
 856 #endif
 857 #ifdef RE_ENABLE_I18N
 858   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 859 #else
 860   size_t max_i18n_object_size = 0;
 861 #endif
 862   size_t max_object_size =
 863     MAX (sizeof (struct re_state_table_entry),
 864          MAX (sizeof (re_token_t),
 865               MAX (sizeof (re_node_set),
 866                    MAX (sizeof (regmatch_t),
 867                         max_i18n_object_size))));
 868
 869   memset (dfa, '\0', sizeof (re_dfa_t));
 870
 871   /* Force allocation of str_tree_storage the first time.  */
 872   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 873
 874   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 875      calculation below, and for similar doubling calculations
 876      elsewhere.  And it's <= rather than <, because some of the
 877      doubling calculations add 1 afterwards.  */
 878   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 879     return REG_ESPACE;
 880
 881   dfa->nodes_alloc = pat_len + 1;
 882   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 883
 884   /*  table_size = 2 ^ ceil(log pat_len) */
 885   for (table_size = 1; ; table_size <<= 1)
 886     if (table_size > pat_len)
 887       break;
 888
 889   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 890   dfa->state_hash_mask = table_size - 1;
 891
 892   dfa->mb_cur_max = MB_CUR_MAX;
 893 #ifdef _LIBC
 894   if (dfa->mb_cur_max == 6
 895       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 896     dfa->is_utf8 = 1;
 897   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 898                        != 0);
 899 #else
 900   codeset_name = nl_langinfo (CODESET);
 901   if (strcasecmp (codeset_name, "UTF-8") == 0
 902       || strcasecmp (codeset_name, "UTF8") == 0)
 903     dfa->is_utf8 = 1;
 904
 905   /* We check exhaustively in the loop below if this charset is a
 906      superset of ASCII.  */
 907   dfa->map_notascii = 0;
 908 #endif
 909
 910 #ifdef RE_ENABLE_I18N
 911   if (dfa->mb_cur_max > 1)
 912     {
 913       if (dfa->is_utf8)
 914         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 915       else
 916         {
 917           int i, j, ch;
 918
 919           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 920           if (BE (dfa->sb_char == NULL, 0))
 921             return REG_ESPACE;
 922
 923           /* Set the bits corresponding to single byte chars.  */
 924           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 925             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 926               {
 927                 wint_t wch = __btowc (ch);
 928                 if (wch != WEOF)
 929                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 930 # ifndef _LIBC
 931                 if (isascii (ch) && wch != ch)
 932                   dfa->map_notascii = 1;
 933 # endif
 934               }
 935         }
 936     }
 937 #endif
 938
 939   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 940     return REG_ESPACE;
 941   return REG_NOERROR;
 942 }
 943
 944 /* Initialize WORD_CHAR table, which indicate which character is
 945    "word".  In this case "word" means that it is the word construction
 946    character used by some operators like "\<", "\>", etc.  */
 947
 948 static void
 949 internal_function
 950 init_word_char (re_dfa_t *dfa)
 951 {
 952   int i, j, ch;
 953   dfa->word_ops_used = 1;
 954   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 955     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 956       if (isalnum (ch) || ch == '_')
 957         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 958 }
 959
 960 /* Free the work area which are only used while compiling.  */
 961
 962 static void
 963 free_workarea_compile (regex_t *preg)
 964 {
 965   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 966   bin_tree_storage_t *storage, *next;
 967   for (storage = dfa->str_tree_storage; storage; storage = next)
 968     {
 969       next = storage->next;
 970       re_free (storage);
 971     }
 972   dfa->str_tree_storage = NULL;
 973   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 974   dfa->str_tree = NULL;
 975   re_free (dfa->org_indices);
 976   dfa->org_indices = NULL;
 977 }
 978
 979 /* Create initial states for all contexts.  */
 980
 981 static reg_errcode_t
 982 create_initial_state (re_dfa_t *dfa)
 983 {
 984   Idx first, i;
 985   reg_errcode_t err;
 986   re_node_set init_nodes;
 987
 988   /* Initial states have the epsilon closure of the node which is
 989      the first node of the regular expression.  */
 990   first = dfa->str_tree->first->node_idx;
 991   dfa->init_node = first;
 992   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 993   if (BE (err != REG_NOERROR, 0))
 994     return err;
 995
 996   /* The back-references which are in initial states can epsilon transit,
 997      since in this case all of the subexpressions can be null.
 998      Then we add epsilon closures of the nodes which are the next nodes of
 999      the back-references.  */
1000   if (dfa->nbackref > 0)
1001     for (i = 0; i < init_nodes.nelem; ++i)
1002       {
1003         Idx node_idx = init_nodes.elems[i];
1004         re_token_type_t type = dfa->nodes[node_idx].type;
1005
1006         Idx clexp_idx;
1007         if (type != OP_BACK_REF)
1008           continue;
1009         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1010           {
1011             re_token_t *clexp_node;
1012             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1013             if (clexp_node->type == OP_CLOSE_SUBEXP
1014                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1015               break;
1016           }
1017         if (clexp_idx == init_nodes.nelem)
1018           continue;
1019
1020         if (type == OP_BACK_REF)
1021           {
1022             Idx dest_idx = dfa->edests[node_idx].elems[0];
1023             if (!re_node_set_contains (&init_nodes, dest_idx))
1024               {
1025                 reg_errcode_t merge_err
1026                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1027                 if (merge_err != REG_NOERROR)
1028                   return merge_err;
1029                 i = 0;
1030               }
1031           }
1032       }
1033
1034   /* It must be the first time to invoke acquire_state.  */
1035   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1036   /* We don't check ERR here, since the initial state must not be NULL.  */
1037   if (BE (dfa->init_state == NULL, 0))
1038     return err;
1039   if (dfa->init_state->has_constraint)
1040     {
1041       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1042                                                        CONTEXT_WORD);
1043       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1044                                                      CONTEXT_NEWLINE);
1045       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1046                                                          &init_nodes,
1047                                                          CONTEXT_NEWLINE
1048                                                          | CONTEXT_BEGBUF);
1049       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1050               || dfa->init_state_begbuf == NULL, 0))
1051         return err;
1052     }
1053   else
1054     dfa->init_state_word = dfa->init_state_nl
1055       = dfa->init_state_begbuf = dfa->init_state;
1056
1057   re_node_set_free (&init_nodes);
1058   return REG_NOERROR;
1059 }
1060 \f
1061 #ifdef RE_ENABLE_I18N
1062 /* If it is possible to do searching in single byte encoding instead of UTF-8
1063    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1064    DFA nodes where needed.  */
1065
1066 static void
1067 optimize_utf8 (re_dfa_t *dfa)
1068 {
1069   Idx node;
1070   int i;
1071   bool mb_chars = false;
1072   bool has_period = false;
1073
1074   for (node = 0; node < dfa->nodes_len; ++node)
1075     switch (dfa->nodes[node].type)
1076       {
1077       case CHARACTER:
1078         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1079           mb_chars = true;
1080         break;
1081       case ANCHOR:
1082         switch (dfa->nodes[node].opr.ctx_type)
1083           {
1084           case LINE_FIRST:
1085           case LINE_LAST:
1086           case BUF_FIRST:
1087           case BUF_LAST:
1088             break;
1089           default:
1090             /* Word anchors etc. cannot be handled.  It's okay to test
1091                opr.ctx_type since constraints (for all DFA nodes) are
1092                created by ORing one or more opr.ctx_type values.  */
1093             return;
1094           }
1095         break;
1096       case OP_PERIOD:
1097         has_period = true;
1098         break;
1099       case OP_BACK_REF:
1100       case OP_ALT:
1101       case END_OF_RE:
1102       case OP_DUP_ASTERISK:
1103       case OP_OPEN_SUBEXP:
1104       case OP_CLOSE_SUBEXP:
1105         break;
1106       case COMPLEX_BRACKET:
1107         return;
1108       case SIMPLE_BRACKET:
1109         /* Just double check.  */
1110         {
1111           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1112                         ? 0
1113                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1114           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1115             {
1116               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1117                 return;
1118               rshift = 0;
1119             }
1120         }
1121         break;
1122       default:
1123         abort ();
1124       }
1125
1126   if (mb_chars || has_period)
1127     for (node = 0; node < dfa->nodes_len; ++node)
1128       {
1129         if (dfa->nodes[node].type == CHARACTER
1130             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1131           dfa->nodes[node].mb_partial = 0;
1132         else if (dfa->nodes[node].type == OP_PERIOD)
1133           dfa->nodes[node].type = OP_UTF8_PERIOD;
1134       }
1135
1136   /* The search can be in single byte locale.  */
1137   dfa->mb_cur_max = 1;
1138   dfa->is_utf8 = 0;
1139   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1140 }
1141 #endif
1142 \f
1143 /* Analyze the structure tree, and calculate "first", "next", "edest",
1144    "eclosure", and "inveclosure".  */
1145
1146 static reg_errcode_t
1147 analyze (regex_t *preg)
1148 {
1149   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1150   reg_errcode_t ret;
1151
1152   /* Allocate arrays.  */
1153   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1154   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1155   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1156   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1157   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1158           || dfa->eclosures == NULL, 0))
1159     return REG_ESPACE;
1160
1161   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1162   if (dfa->subexp_map != NULL)
1163     {
1164       Idx i;
1165       for (i = 0; i < preg->re_nsub; i++)
1166         dfa->subexp_map[i] = i;
1167       preorder (dfa->str_tree, optimize_subexps, dfa);
1168       for (i = 0; i < preg->re_nsub; i++)
1169         if (dfa->subexp_map[i] != i)
1170           break;
1171       if (i == preg->re_nsub)
1172         {
1173           free (dfa->subexp_map);
1174           dfa->subexp_map = NULL;
1175         }
1176     }
1177
1178   ret = postorder (dfa->str_tree, lower_subexps, preg);
1179   if (BE (ret != REG_NOERROR, 0))
1180     return ret;
1181   ret = postorder (dfa->str_tree, calc_first, dfa);
1182   if (BE (ret != REG_NOERROR, 0))
1183     return ret;
1184   preorder (dfa->str_tree, calc_next, dfa);
1185   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1186   if (BE (ret != REG_NOERROR, 0))
1187     return ret;
1188   ret = calc_eclosure (dfa);
1189   if (BE (ret != REG_NOERROR, 0))
1190     return ret;
1191
1192   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1193      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1194   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1195       || dfa->nbackref)
1196     {
1197       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1198       if (BE (dfa->inveclosures == NULL, 0))
1199         return REG_ESPACE;
1200       ret = calc_inveclosure (dfa);
1201     }
1202
1203   return ret;
1204 }
1205
1206 /* Our parse trees are very unbalanced, so we cannot use a stack to
1207    implement parse tree visits.  Instead, we use parent pointers and
1208    some hairy code in these two functions.  */
1209 static reg_errcode_t
1210 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1211            void *extra)
1212 {
1213   bin_tree_t *node, *prev;
1214
1215   for (node = root; ; )
1216     {
1217       /* Descend down the tree, preferably to the left (or to the right
1218          if that's the only child).  */
1219       while (node->left || node->right)
1220         if (node->left)
1221           node = node->left;
1222         else
1223           node = node->right;
1224
1225       do
1226         {
1227           reg_errcode_t err = fn (extra, node);
1228           if (BE (err != REG_NOERROR, 0))
1229             return err;
1230           if (node->parent == NULL)
1231             return REG_NOERROR;
1232           prev = node;
1233           node = node->parent;
1234         }
1235       /* Go up while we have a node that is reached from the right.  */
1236       while (node->right == prev || node->right == NULL);
1237       node = node->right;
1238     }
1239 }
1240
1241 static reg_errcode_t
1242 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1243           void *extra)
1244 {
1245   bin_tree_t *node;
1246
1247   for (node = root; ; )
1248     {
1249       reg_errcode_t err = fn (extra, node);
1250       if (BE (err != REG_NOERROR, 0))
1251         return err;
1252
1253       /* Go to the left node, or up and to the right.  */
1254       if (node->left)
1255         node = node->left;
1256       else
1257         {
1258           bin_tree_t *prev = NULL;
1259           while (node->right == prev || node->right == NULL)
1260             {
1261               prev = node;
1262               node = node->parent;
1263               if (!node)
1264                 return REG_NOERROR;
1265             }
1266           node = node->right;
1267         }
1268     }
1269 }
1270
1271 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1272    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1273    backreferences as well.  Requires a preorder visit.  */
1274 static reg_errcode_t
1275 optimize_subexps (void *extra, bin_tree_t *node)
1276 {
1277   re_dfa_t *dfa = (re_dfa_t *) extra;
1278
1279   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1280     {
1281       int idx = node->token.opr.idx;
1282       node->token.opr.idx = dfa->subexp_map[idx];
1283       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1284     }
1285
1286   else if (node->token.type == SUBEXP
1287            && node->left && node->left->token.type == SUBEXP)
1288     {
1289       Idx other_idx = node->left->token.opr.idx;
1290
1291       node->left = node->left->left;
1292       if (node->left)
1293         node->left->parent = node;
1294
1295       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1296       if (other_idx < BITSET_WORD_BITS)
1297         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1298     }
1299
1300   return REG_NOERROR;
1301 }
1302
1303 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1304    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1305 static reg_errcode_t
1306 lower_subexps (void *extra, bin_tree_t *node)
1307 {
1308   regex_t *preg = (regex_t *) extra;
1309   reg_errcode_t err = REG_NOERROR;
1310
1311   if (node->left && node->left->token.type == SUBEXP)
1312     {
1313       node->left = lower_subexp (&err, preg, node->left);
1314       if (node->left)
1315         node->left->parent = node;
1316     }
1317   if (node->right && node->right->token.type == SUBEXP)
1318     {
1319       node->right = lower_subexp (&err, preg, node->right);
1320       if (node->right)
1321         node->right->parent = node;
1322     }
1323
1324   return err;
1325 }
1326
1327 static bin_tree_t *
1328 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1329 {
1330   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1331   bin_tree_t *body = node->left;
1332   bin_tree_t *op, *cls, *tree1, *tree;
1333
1334   if (preg->no_sub
1335       /* We do not optimize empty subexpressions, because otherwise we may
1336          have bad CONCAT nodes with NULL children.  This is obviously not
1337          very common, so we do not lose much.  An example that triggers
1338          this case is the sed "script" /\(\)/x.  */
1339       && node->left != NULL
1340       && (node->token.opr.idx >= BITSET_WORD_BITS
1341           || !(dfa->used_bkref_map
1342                & ((bitset_word_t) 1 << node->token.opr.idx))))
1343     return node->left;
1344
1345   /* Convert the SUBEXP node to the concatenation of an
1346      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1347   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1348   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1349   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1350   tree = create_tree (dfa, op, tree1, CONCAT);
1351   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1352     {
1353       *err = REG_ESPACE;
1354       return NULL;
1355     }
1356
1357   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1358   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1359   return tree;
1360 }
1361
1362 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1363    nodes.  Requires a postorder visit.  */
1364 static reg_errcode_t
1365 calc_first (void *extra, bin_tree_t *node)
1366 {
1367   re_dfa_t *dfa = (re_dfa_t *) extra;
1368   if (node->token.type == CONCAT)
1369     {
1370       node->first = node->left->first;
1371       node->node_idx = node->left->node_idx;
1372     }
1373   else
1374     {
1375       node->first = node;
1376       node->node_idx = re_dfa_add_node (dfa, node->token);
1377       if (BE (node->node_idx == REG_MISSING, 0))
1378         return REG_ESPACE;
1379       if (node->token.type == ANCHOR)
1380         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1381     }
1382   return REG_NOERROR;
1383 }
1384
1385 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1386 static reg_errcode_t
1387 calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
1388 {
1389   switch (node->token.type)
1390     {
1391     case OP_DUP_ASTERISK:
1392       node->left->next = node;
1393       break;
1394     case CONCAT:
1395       node->left->next = node->right->first;
1396       node->right->next = node->next;
1397       break;
1398     default:
1399       if (node->left)
1400         node->left->next = node->next;
1401       if (node->right)
1402         node->right->next = node->next;
1403       break;
1404     }
1405   return REG_NOERROR;
1406 }
1407
1408 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1409 static reg_errcode_t
1410 link_nfa_nodes (void *extra, bin_tree_t *node)
1411 {
1412   re_dfa_t *dfa = (re_dfa_t *) extra;
1413   Idx idx = node->node_idx;
1414   reg_errcode_t err = REG_NOERROR;
1415
1416   switch (node->token.type)
1417     {
1418     case CONCAT:
1419       break;
1420
1421     case END_OF_RE:
1422       assert (node->next == NULL);
1423       break;
1424
1425     case OP_DUP_ASTERISK:
1426     case OP_ALT:
1427       {
1428         Idx left, right;
1429         dfa->has_plural_match = 1;
1430         if (node->left != NULL)
1431           left = node->left->first->node_idx;
1432         else
1433           left = node->next->node_idx;
1434         if (node->right != NULL)
1435           right = node->right->first->node_idx;
1436         else
1437           right = node->next->node_idx;
1438         assert (REG_VALID_INDEX (left));
1439         assert (REG_VALID_INDEX (right));
1440         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1441       }
1442       break;
1443
1444     case ANCHOR:
1445     case OP_OPEN_SUBEXP:
1446     case OP_CLOSE_SUBEXP:
1447       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1448       break;
1449
1450     case OP_BACK_REF:
1451       dfa->nexts[idx] = node->next->node_idx;
1452       if (node->token.type == OP_BACK_REF)
1453         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1454       break;
1455
1456     default:
1457       assert (!IS_EPSILON_NODE (node->token.type));
1458       dfa->nexts[idx] = node->next->node_idx;
1459       break;
1460     }
1461
1462   return err;
1463 }
1464
1465 /* Duplicate the epsilon closure of the node ROOT_NODE.
1466    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1467    to their own constraint.  */
1468
1469 static reg_errcode_t
1470 internal_function
1471 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1472                         Idx root_node, unsigned int init_constraint)
1473 {
1474   Idx org_node, clone_node;
1475   bool ok;
1476   unsigned int constraint = init_constraint;
1477   for (org_node = top_org_node, clone_node = top_clone_node;;)
1478     {
1479       Idx org_dest, clone_dest;
1480       if (dfa->nodes[org_node].type == OP_BACK_REF)
1481         {
1482           /* If the back reference epsilon-transit, its destination must
1483              also have the constraint.  Then duplicate the epsilon closure
1484              of the destination of the back reference, and store it in
1485              edests of the back reference.  */
1486           org_dest = dfa->nexts[org_node];
1487           re_node_set_empty (dfa->edests + clone_node);
1488           clone_dest = duplicate_node (dfa, org_dest, constraint);
1489           if (BE (clone_dest == REG_MISSING, 0))
1490             return REG_ESPACE;
1491           dfa->nexts[clone_node] = dfa->nexts[org_node];
1492           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1493           if (BE (! ok, 0))
1494             return REG_ESPACE;
1495         }
1496       else if (dfa->edests[org_node].nelem == 0)
1497         {
1498           /* In case of the node can't epsilon-transit, don't duplicate the
1499              destination and store the original destination as the
1500              destination of the node.  */
1501           dfa->nexts[clone_node] = dfa->nexts[org_node];
1502           break;
1503         }
1504       else if (dfa->edests[org_node].nelem == 1)
1505         {
1506           /* In case of the node can epsilon-transit, and it has only one
1507              destination.  */
1508           org_dest = dfa->edests[org_node].elems[0];
1509           re_node_set_empty (dfa->edests + clone_node);
1510           /* If the node is root_node itself, it means the epsilon closure
1511              has a loop.  Then tie it to the destination of the root_node.  */
1512           if (org_node == root_node && clone_node != org_node)
1513             {
1514               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1515               if (BE (! ok, 0))
1516                 return REG_ESPACE;
1517               break;
1518             }
1519           /* In case the node has another constraint, append it.  */
1520           constraint |= dfa->nodes[org_node].constraint;
1521           clone_dest = duplicate_node (dfa, org_dest, constraint);
1522           if (BE (clone_dest == REG_MISSING, 0))
1523             return REG_ESPACE;
1524           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1525           if (BE (! ok, 0))
1526             return REG_ESPACE;
1527         }
1528       else /* dfa->edests[org_node].nelem == 2 */
1529         {
1530           /* In case of the node can epsilon-transit, and it has two
1531              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1532           org_dest = dfa->edests[org_node].elems[0];
1533           re_node_set_empty (dfa->edests + clone_node);
1534           /* Search for a duplicated node which satisfies the constraint.  */
1535           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1536           if (clone_dest == REG_MISSING)
1537             {
1538               /* There is no such duplicated node, create a new one.  */
1539               reg_errcode_t err;
1540               clone_dest = duplicate_node (dfa, org_dest, constraint);
1541               if (BE (clone_dest == REG_MISSING, 0))
1542                 return REG_ESPACE;
1543               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544               if (BE (! ok, 0))
1545                 return REG_ESPACE;
1546               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547                                             root_node, constraint);
1548               if (BE (err != REG_NOERROR, 0))
1549                 return err;
1550             }
1551           else
1552             {
1553               /* There is a duplicated node which satisfies the constraint,
1554                  use it to avoid infinite loop.  */
1555               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556               if (BE (! ok, 0))
1557                 return REG_ESPACE;
1558             }
1559
1560           org_dest = dfa->edests[org_node].elems[1];
1561           clone_dest = duplicate_node (dfa, org_dest, constraint);
1562           if (BE (clone_dest == REG_MISSING, 0))
1563             return REG_ESPACE;
1564           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565           if (BE (! ok, 0))
1566             return REG_ESPACE;
1567         }
1568       org_node = org_dest;
1569       clone_node = clone_dest;
1570     }
1571   return REG_NOERROR;
1572 }
1573
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575    satisfies the constraint CONSTRAINT.  */
1576
1577 static Idx
1578 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1579                         unsigned int constraint)
1580 {
1581   Idx idx;
1582   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1583     {
1584       if (org_node == dfa->org_indices[idx]
1585           && constraint == dfa->nodes[idx].constraint)
1586         return idx; /* Found.  */
1587     }
1588   return REG_MISSING; /* Not found.  */
1589 }
1590
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592    Return the index of the new node, or REG_MISSING if insufficient storage is
1593    available.  */
1594
1595 static Idx
1596 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1597 {
1598   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1599   if (BE (dup_idx != REG_MISSING, 1))
1600     {
1601       dfa->nodes[dup_idx].constraint = constraint;
1602       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1603       dfa->nodes[dup_idx].duplicated = 1;
1604
1605       /* Store the index of the original node.  */
1606       dfa->org_indices[dup_idx] = org_idx;
1607     }
1608   return dup_idx;
1609 }
1610
1611 static reg_errcode_t
1612 calc_inveclosure (re_dfa_t *dfa)
1613 {
1614   Idx src, idx;
1615   bool ok;
1616   for (idx = 0; idx < dfa->nodes_len; ++idx)
1617     re_node_set_init_empty (dfa->inveclosures + idx);
1618
1619   for (src = 0; src < dfa->nodes_len; ++src)
1620     {
1621       Idx *elems = dfa->eclosures[src].elems;
1622       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1623         {
1624           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1625           if (BE (! ok, 0))
1626             return REG_ESPACE;
1627         }
1628     }
1629
1630   return REG_NOERROR;
1631 }
1632
1633 /* Calculate "eclosure" for all the node in DFA.  */
1634
1635 static reg_errcode_t
1636 calc_eclosure (re_dfa_t *dfa)
1637 {
1638   Idx node_idx;
1639   bool incomplete;
1640 #ifdef DEBUG
1641   assert (dfa->nodes_len > 0);
1642 #endif
1643   incomplete = false;
1644   /* For each nodes, calculate epsilon closure.  */
1645   for (node_idx = 0; ; ++node_idx)
1646     {
1647       reg_errcode_t err;
1648       re_node_set eclosure_elem;
1649       if (node_idx == dfa->nodes_len)
1650         {
1651           if (!incomplete)
1652             break;
1653           incomplete = false;
1654           node_idx = 0;
1655         }
1656
1657 #ifdef DEBUG
1658       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1659 #endif
1660
1661       /* If we have already calculated, skip it.  */
1662       if (dfa->eclosures[node_idx].nelem != 0)
1663         continue;
1664       /* Calculate epsilon closure of `node_idx'.  */
1665       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1666       if (BE (err != REG_NOERROR, 0))
1667         return err;
1668
1669       if (dfa->eclosures[node_idx].nelem == 0)
1670         {
1671           incomplete = true;
1672           re_node_set_free (&eclosure_elem);
1673         }
1674     }
1675   return REG_NOERROR;
1676 }
1677
1678 /* Calculate epsilon closure of NODE.  */
1679
1680 static reg_errcode_t
1681 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1682 {
1683   reg_errcode_t err;
1684   Idx i;
1685   re_node_set eclosure;
1686   bool ok;
1687   bool incomplete = false;
1688   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1689   if (BE (err != REG_NOERROR, 0))
1690     return err;
1691
1692   /* This indicates that we are calculating this node now.
1693      We reference this value to avoid infinite loop.  */
1694   dfa->eclosures[node].nelem = REG_MISSING;
1695
1696   /* If the current node has constraints, duplicate all nodes
1697      since they must inherit the constraints.  */
1698   if (dfa->nodes[node].constraint
1699       && dfa->edests[node].nelem
1700       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1701     {
1702       err = duplicate_node_closure (dfa, node, node, node,
1703                                     dfa->nodes[node].constraint);
1704       if (BE (err != REG_NOERROR, 0))
1705         return err;
1706     }
1707
1708   /* Expand each epsilon destination nodes.  */
1709   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1710     for (i = 0; i < dfa->edests[node].nelem; ++i)
1711       {
1712         re_node_set eclosure_elem;
1713         Idx edest = dfa->edests[node].elems[i];
1714         /* If calculating the epsilon closure of `edest' is in progress,
1715            return intermediate result.  */
1716         if (dfa->eclosures[edest].nelem == REG_MISSING)
1717           {
1718             incomplete = true;
1719             continue;
1720           }
1721         /* If we haven't calculated the epsilon closure of `edest' yet,
1722            calculate now. Otherwise use calculated epsilon closure.  */
1723         if (dfa->eclosures[edest].nelem == 0)
1724           {
1725             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1726             if (BE (err != REG_NOERROR, 0))
1727               return err;
1728           }
1729         else
1730           eclosure_elem = dfa->eclosures[edest];
1731         /* Merge the epsilon closure of `edest'.  */
1732         err = re_node_set_merge (&eclosure, &eclosure_elem);
1733         if (BE (err != REG_NOERROR, 0))
1734           return err;
1735         /* If the epsilon closure of `edest' is incomplete,
1736            the epsilon closure of this node is also incomplete.  */
1737         if (dfa->eclosures[edest].nelem == 0)
1738           {
1739             incomplete = true;
1740             re_node_set_free (&eclosure_elem);
1741           }
1742       }
1743
1744   /* An epsilon closure includes itself.  */
1745   ok = re_node_set_insert (&eclosure, node);
1746   if (BE (! ok, 0))
1747     return REG_ESPACE;
1748   if (incomplete && !root)
1749     dfa->eclosures[node].nelem = 0;
1750   else
1751     dfa->eclosures[node] = eclosure;
1752   *new_set = eclosure;
1753   return REG_NOERROR;
1754 }
1755 \f
1756 /* Functions for token which are used in the parser.  */
1757
1758 /* Fetch a token from INPUT.
1759    We must not use this function inside bracket expressions.  */
1760
1761 static void
1762 internal_function
1763 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1764 {
1765   re_string_skip_bytes (input, peek_token (result, input, syntax));
1766 }
1767
1768 /* Peek a token from INPUT, and return the length of the token.
1769    We must not use this function inside bracket expressions.  */
1770
1771 static int
1772 internal_function
1773 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1774 {
1775   unsigned char c;
1776
1777   if (re_string_eoi (input))
1778     {
1779       token->type = END_OF_RE;
1780       return 0;
1781     }
1782
1783   c = re_string_peek_byte (input, 0);
1784   token->opr.c = c;
1785
1786   token->word_char = 0;
1787 #ifdef RE_ENABLE_I18N
1788   token->mb_partial = 0;
1789   if (input->mb_cur_max > 1 &&
1790       !re_string_first_byte (input, re_string_cur_idx (input)))
1791     {
1792       token->type = CHARACTER;
1793       token->mb_partial = 1;
1794       return 1;
1795     }
1796 #endif
1797   if (c == '\\')
1798     {
1799       unsigned char c2;
1800       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1801         {
1802           token->type = BACK_SLASH;
1803           return 1;
1804         }
1805
1806       c2 = re_string_peek_byte_case (input, 1);
1807       token->opr.c = c2;
1808       token->type = CHARACTER;
1809 #ifdef RE_ENABLE_I18N
1810       if (input->mb_cur_max > 1)
1811         {
1812           wint_t wc = re_string_wchar_at (input,
1813                                           re_string_cur_idx (input) + 1);
1814           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1815         }
1816       else
1817 #endif
1818         token->word_char = IS_WORD_CHAR (c2) != 0;
1819
1820       switch (c2)
1821         {
1822         case '|':
1823           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1824             token->type = OP_ALT;
1825           break;
1826         case '1': case '2': case '3': case '4': case '5':
1827         case '6': case '7': case '8': case '9':
1828           if (!(syntax & RE_NO_BK_REFS))
1829             {
1830               token->type = OP_BACK_REF;
1831               token->opr.idx = c2 - '1';
1832             }
1833           break;
1834         case '<':
1835           if (!(syntax & RE_NO_GNU_OPS))
1836             {
1837               token->type = ANCHOR;
1838               token->opr.ctx_type = WORD_FIRST;
1839             }
1840           break;
1841         case '>':
1842           if (!(syntax & RE_NO_GNU_OPS))
1843             {
1844               token->type = ANCHOR;
1845               token->opr.ctx_type = WORD_LAST;
1846             }
1847           break;
1848         case 'b':
1849           if (!(syntax & RE_NO_GNU_OPS))
1850             {
1851               token->type = ANCHOR;
1852               token->opr.ctx_type = WORD_DELIM;
1853             }
1854           break;
1855         case 'B':
1856           if (!(syntax & RE_NO_GNU_OPS))
1857             {
1858               token->type = ANCHOR;
1859               token->opr.ctx_type = NOT_WORD_DELIM;
1860             }
1861           break;
1862         case 'w':
1863           if (!(syntax & RE_NO_GNU_OPS))
1864             token->type = OP_WORD;
1865           break;
1866         case 'W':
1867           if (!(syntax & RE_NO_GNU_OPS))
1868             token->type = OP_NOTWORD;
1869           break;
1870         case 's':
1871           if (!(syntax & RE_NO_GNU_OPS))
1872             token->type = OP_SPACE;
1873           break;
1874         case 'S':
1875           if (!(syntax & RE_NO_GNU_OPS))
1876             token->type = OP_NOTSPACE;
1877           break;
1878         case '`':
1879           if (!(syntax & RE_NO_GNU_OPS))
1880             {
1881               token->type = ANCHOR;
1882               token->opr.ctx_type = BUF_FIRST;
1883             }
1884           break;
1885         case '\'':
1886           if (!(syntax & RE_NO_GNU_OPS))
1887             {
1888               token->type = ANCHOR;
1889               token->opr.ctx_type = BUF_LAST;
1890             }
1891           break;
1892         case '(':
1893           if (!(syntax & RE_NO_BK_PARENS))
1894             token->type = OP_OPEN_SUBEXP;
1895           break;
1896         case ')':
1897           if (!(syntax & RE_NO_BK_PARENS))
1898             token->type = OP_CLOSE_SUBEXP;
1899           break;
1900         case '+':
1901           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902             token->type = OP_DUP_PLUS;
1903           break;
1904         case '?':
1905           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1906             token->type = OP_DUP_QUESTION;
1907           break;
1908         case '{':
1909           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910             token->type = OP_OPEN_DUP_NUM;
1911           break;
1912         case '}':
1913           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1914             token->type = OP_CLOSE_DUP_NUM;
1915           break;
1916         default:
1917           break;
1918         }
1919       return 2;
1920     }
1921
1922   token->type = CHARACTER;
1923 #ifdef RE_ENABLE_I18N
1924   if (input->mb_cur_max > 1)
1925     {
1926       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1927       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1928     }
1929   else
1930 #endif
1931     token->word_char = IS_WORD_CHAR (token->opr.c);
1932
1933   switch (c)
1934     {
1935     case '\n':
1936       if (syntax & RE_NEWLINE_ALT)
1937         token->type = OP_ALT;
1938       break;
1939     case '|':
1940       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1941         token->type = OP_ALT;
1942       break;
1943     case '*':
1944       token->type = OP_DUP_ASTERISK;
1945       break;
1946     case '+':
1947       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948         token->type = OP_DUP_PLUS;
1949       break;
1950     case '?':
1951       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1952         token->type = OP_DUP_QUESTION;
1953       break;
1954     case '{':
1955       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956         token->type = OP_OPEN_DUP_NUM;
1957       break;
1958     case '}':
1959       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1960         token->type = OP_CLOSE_DUP_NUM;
1961       break;
1962     case '(':
1963       if (syntax & RE_NO_BK_PARENS)
1964         token->type = OP_OPEN_SUBEXP;
1965       break;
1966     case ')':
1967       if (syntax & RE_NO_BK_PARENS)
1968         token->type = OP_CLOSE_SUBEXP;
1969       break;
1970     case '[':
1971       token->type = OP_OPEN_BRACKET;
1972       break;
1973     case '.':
1974       token->type = OP_PERIOD;
1975       break;
1976     case '^':
1977       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1978           re_string_cur_idx (input) != 0)
1979         {
1980           char prev = re_string_peek_byte (input, -1);
1981           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1982             break;
1983         }
1984       token->type = ANCHOR;
1985       token->opr.ctx_type = LINE_FIRST;
1986       break;
1987     case '$':
1988       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1989           re_string_cur_idx (input) + 1 != re_string_length (input))
1990         {
1991           re_token_t next;
1992           re_string_skip_bytes (input, 1);
1993           peek_token (&next, input, syntax);
1994           re_string_skip_bytes (input, -1);
1995           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1996             break;
1997         }
1998       token->type = ANCHOR;
1999       token->opr.ctx_type = LINE_LAST;
2000       break;
2001     default:
2002       break;
2003     }
2004   return 1;
2005 }
2006
2007 /* Peek a token from INPUT, and return the length of the token.
2008    We must not use this function out of bracket expressions.  */
2009
2010 static int
2011 internal_function
2012 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2013 {
2014   unsigned char c;
2015   if (re_string_eoi (input))
2016     {
2017       token->type = END_OF_RE;
2018       return 0;
2019     }
2020   c = re_string_peek_byte (input, 0);
2021   token->opr.c = c;
2022
2023 #ifdef RE_ENABLE_I18N
2024   if (input->mb_cur_max > 1 &&
2025       !re_string_first_byte (input, re_string_cur_idx (input)))
2026     {
2027       token->type = CHARACTER;
2028       return 1;
2029     }
2030 #endif /* RE_ENABLE_I18N */
2031
2032   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2033       && re_string_cur_idx (input) + 1 < re_string_length (input))
2034     {
2035       /* In this case, '\' escape a character.  */
2036       unsigned char c2;
2037       re_string_skip_bytes (input, 1);
2038       c2 = re_string_peek_byte (input, 0);
2039       token->opr.c = c2;
2040       token->type = CHARACTER;
2041       return 1;
2042     }
2043   if (c == '[') /* '[' is a special char in a bracket exps.  */
2044     {
2045       unsigned char c2;
2046       int token_len;
2047       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2048         c2 = re_string_peek_byte (input, 1);
2049       else
2050         c2 = 0;
2051       token->opr.c = c2;
2052       token_len = 2;
2053       switch (c2)
2054         {
2055         case '.':
2056           token->type = OP_OPEN_COLL_ELEM;
2057           break;
2058         case '=':
2059           token->type = OP_OPEN_EQUIV_CLASS;
2060           break;
2061         case ':':
2062           if (syntax & RE_CHAR_CLASSES)
2063             {
2064               token->type = OP_OPEN_CHAR_CLASS;
2065               break;
2066             }
2067           /* else fall through.  */
2068         default:
2069           token->type = CHARACTER;
2070           token->opr.c = c;
2071           token_len = 1;
2072           break;
2073         }
2074       return token_len;
2075     }
2076   switch (c)
2077     {
2078     case '-':
2079       token->type = OP_CHARSET_RANGE;
2080       break;
2081     case ']':
2082       token->type = OP_CLOSE_BRACKET;
2083       break;
2084     case '^':
2085       token->type = OP_NON_MATCH_LIST;
2086       break;
2087     default:
2088       token->type = CHARACTER;
2089     }
2090   return 1;
2091 }
2092 \f
2093 /* Functions for parser.  */
2094
2095 /* Entry point of the parser.
2096    Parse the regular expression REGEXP and return the structure tree.
2097    If an error is occured, ERR is set by error code, and return NULL.
2098    This function build the following tree, from regular expression <reg_exp>:
2099            CAT
2100            / \
2101           /   \
2102    <reg_exp>  EOR
2103
2104    CAT means concatenation.
2105    EOR means end of regular expression.  */
2106
2107 static bin_tree_t *
2108 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2109        reg_errcode_t *err)
2110 {
2111   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2112   bin_tree_t *tree, *eor, *root;
2113   re_token_t current_token;
2114   dfa->syntax = syntax;
2115   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2116   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2117   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2118     return NULL;
2119   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2120   if (tree != NULL)
2121     root = create_tree (dfa, tree, eor, CONCAT);
2122   else
2123     root = eor;
2124   if (BE (eor == NULL || root == NULL, 0))
2125     {
2126       *err = REG_ESPACE;
2127       return NULL;
2128     }
2129   return root;
2130 }
2131
2132 /* This function build the following tree, from regular expression
2133    <branch1>|<branch2>:
2134            ALT
2135            / \
2136           /   \
2137    <branch1> <branch2>
2138
2139    ALT means alternative, which represents the operator `|'.  */
2140
2141 static bin_tree_t *
2142 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2143                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2144 {
2145   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2146   bin_tree_t *tree, *branch = NULL;
2147   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2148   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149     return NULL;
2150
2151   while (token->type == OP_ALT)
2152     {
2153       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154       if (token->type != OP_ALT && token->type != END_OF_RE
2155           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2156         {
2157           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2158           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2159             return NULL;
2160         }
2161       else
2162         branch = NULL;
2163       tree = create_tree (dfa, tree, branch, OP_ALT);
2164       if (BE (tree == NULL, 0))
2165         {
2166           *err = REG_ESPACE;
2167           return NULL;
2168         }
2169     }
2170   return tree;
2171 }
2172
2173 /* This function build the following tree, from regular expression
2174    <exp1><exp2>:
2175         CAT
2176         / \
2177        /   \
2178    <exp1> <exp2>
2179
2180    CAT means concatenation.  */
2181
2182 static bin_tree_t *
2183 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2184               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2185 {
2186   bin_tree_t *tree, *expr;
2187   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2188   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2189   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190     return NULL;
2191
2192   while (token->type != OP_ALT && token->type != END_OF_RE
2193          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194     {
2195       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2196       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2197         {
2198           return NULL;
2199         }
2200       if (tree != NULL && expr != NULL)
2201         {
2202           tree = create_tree (dfa, tree, expr, CONCAT);
2203           if (tree == NULL)
2204             {
2205               *err = REG_ESPACE;
2206               return NULL;
2207             }
2208         }
2209       else if (tree == NULL)
2210         tree = expr;
2211       /* Otherwise expr == NULL, we don't need to create new tree.  */
2212     }
2213   return tree;
2214 }
2215
2216 /* This function build the following tree, from regular expression a*:
2217          *
2218          |
2219          a
2220 */
2221
2222 static bin_tree_t *
2223 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2224                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2225 {
2226   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227   bin_tree_t *tree;
2228   switch (token->type)
2229     {
2230     case CHARACTER:
2231       tree = create_token_tree (dfa, NULL, NULL, token);
2232       if (BE (tree == NULL, 0))
2233         {
2234           *err = REG_ESPACE;
2235           return NULL;
2236         }
2237 #ifdef RE_ENABLE_I18N
2238       if (dfa->mb_cur_max > 1)
2239         {
2240           while (!re_string_eoi (regexp)
2241                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242             {
2243               bin_tree_t *mbc_remain;
2244               fetch_token (token, regexp, syntax);
2245               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2246               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2247               if (BE (mbc_remain == NULL || tree == NULL, 0))
2248                 {
2249                   *err = REG_ESPACE;
2250                   return NULL;
2251                 }
2252             }
2253         }
2254 #endif
2255       break;
2256     case OP_OPEN_SUBEXP:
2257       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2258       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259         return NULL;
2260       break;
2261     case OP_OPEN_BRACKET:
2262       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2263       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2264         return NULL;
2265       break;
2266     case OP_BACK_REF:
2267       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2268         {
2269           *err = REG_ESUBREG;
2270           return NULL;
2271         }
2272       dfa->used_bkref_map |= 1 << token->opr.idx;
2273       tree = create_token_tree (dfa, NULL, NULL, token);
2274       if (BE (tree == NULL, 0))
2275         {
2276           *err = REG_ESPACE;
2277           return NULL;
2278         }
2279       ++dfa->nbackref;
2280       dfa->has_mb_node = 1;
2281       break;
2282     case OP_OPEN_DUP_NUM:
2283       if (syntax & RE_CONTEXT_INVALID_DUP)
2284         {
2285           *err = REG_BADRPT;
2286           return NULL;
2287         }
2288       /* FALLTHROUGH */
2289     case OP_DUP_ASTERISK:
2290     case OP_DUP_PLUS:
2291     case OP_DUP_QUESTION:
2292       if (syntax & RE_CONTEXT_INVALID_OPS)
2293         {
2294           *err = REG_BADRPT;
2295           return NULL;
2296         }
2297       else if (syntax & RE_CONTEXT_INDEP_OPS)
2298         {
2299           fetch_token (token, regexp, syntax);
2300           return parse_expression (regexp, preg, token, syntax, nest, err);
2301         }
2302       /* else fall through  */
2303     case OP_CLOSE_SUBEXP:
2304       if ((token->type == OP_CLOSE_SUBEXP) &&
2305           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2306         {
2307           *err = REG_ERPAREN;
2308           return NULL;
2309         }
2310       /* else fall through  */
2311     case OP_CLOSE_DUP_NUM:
2312       /* We treat it as a normal character.  */
2313
2314       /* Then we can these characters as normal characters.  */
2315       token->type = CHARACTER;
2316       /* mb_partial and word_char bits should be initialized already
2317          by peek_token.  */
2318       tree = create_token_tree (dfa, NULL, NULL, token);
2319       if (BE (tree == NULL, 0))
2320         {
2321           *err = REG_ESPACE;
2322           return NULL;
2323         }
2324       break;
2325     case ANCHOR:
2326       if ((token->opr.ctx_type
2327            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2328           && dfa->word_ops_used == 0)
2329         init_word_char (dfa);
2330       if (token->opr.ctx_type == WORD_DELIM
2331           || token->opr.ctx_type == NOT_WORD_DELIM)
2332         {
2333           bin_tree_t *tree_first, *tree_last;
2334           if (token->opr.ctx_type == WORD_DELIM)
2335             {
2336               token->opr.ctx_type = WORD_FIRST;
2337               tree_first = create_token_tree (dfa, NULL, NULL, token);
2338               token->opr.ctx_type = WORD_LAST;
2339             }
2340           else
2341             {
2342               token->opr.ctx_type = INSIDE_WORD;
2343               tree_first = create_token_tree (dfa, NULL, NULL, token);
2344               token->opr.ctx_type = INSIDE_NOTWORD;
2345             }
2346           tree_last = create_token_tree (dfa, NULL, NULL, token);
2347           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2348           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2349             {
2350               *err = REG_ESPACE;
2351               return NULL;
2352             }
2353         }
2354       else
2355         {
2356           tree = create_token_tree (dfa, NULL, NULL, token);
2357           if (BE (tree == NULL, 0))
2358             {
2359               *err = REG_ESPACE;
2360               return NULL;
2361             }
2362         }
2363       /* We must return here, since ANCHORs can't be followed
2364          by repetition operators.
2365          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2366              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2367       fetch_token (token, regexp, syntax);
2368       return tree;
2369     case OP_PERIOD:
2370       tree = create_token_tree (dfa, NULL, NULL, token);
2371       if (BE (tree == NULL, 0))
2372         {
2373           *err = REG_ESPACE;
2374           return NULL;
2375         }
2376       if (dfa->mb_cur_max > 1)
2377         dfa->has_mb_node = 1;
2378       break;
2379     case OP_WORD:
2380     case OP_NOTWORD:
2381       tree = build_charclass_op (dfa, regexp->trans,
2382                                  (const unsigned char *) "alnum",
2383                                  (const unsigned char *) "_",
2384                                  token->type == OP_NOTWORD, err);
2385       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2386         return NULL;
2387       break;
2388     case OP_SPACE:
2389     case OP_NOTSPACE:
2390       tree = build_charclass_op (dfa, regexp->trans,
2391                                  (const unsigned char *) "space",
2392                                  (const unsigned char *) "",
2393                                  token->type == OP_NOTSPACE, err);
2394       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395         return NULL;
2396       break;
2397     case OP_ALT:
2398     case END_OF_RE:
2399       return NULL;
2400     case BACK_SLASH:
2401       *err = REG_EESCAPE;
2402       return NULL;
2403     default:
2404       /* Must not happen?  */
2405 #ifdef DEBUG
2406       assert (0);
2407 #endif
2408       return NULL;
2409     }
2410   fetch_token (token, regexp, syntax);
2411
2412   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2413          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414     {
2415       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2416       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2417         return NULL;
2418       /* In BRE consecutive duplications are not allowed.  */
2419       if ((syntax & RE_CONTEXT_INVALID_DUP)
2420           && (token->type == OP_DUP_ASTERISK
2421               || token->type == OP_OPEN_DUP_NUM))
2422         {
2423           *err = REG_BADRPT;
2424           return NULL;
2425         }
2426     }
2427
2428   return tree;
2429 }
2430
2431 /* This function build the following tree, from regular expression
2432    (<reg_exp>):
2433          SUBEXP
2434             |
2435         <reg_exp>
2436 */
2437
2438 static bin_tree_t *
2439 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2440                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2441 {
2442   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2443   bin_tree_t *tree;
2444   size_t cur_nsub;
2445   cur_nsub = preg->re_nsub++;
2446
2447   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2448
2449   /* The subexpression may be a null string.  */
2450   if (token->type == OP_CLOSE_SUBEXP)
2451     tree = NULL;
2452   else
2453     {
2454       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2455       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2456         *err = REG_EPAREN;
2457       if (BE (*err != REG_NOERROR, 0))
2458         return NULL;
2459     }
2460
2461   if (cur_nsub <= '9' - '1')
2462     dfa->completed_bkref_map |= 1 << cur_nsub;
2463
2464   tree = create_tree (dfa, tree, NULL, SUBEXP);
2465   if (BE (tree == NULL, 0))
2466     {
2467       *err = REG_ESPACE;
2468       return NULL;
2469     }
2470   tree->token.opr.idx = cur_nsub;
2471   return tree;
2472 }
2473
2474 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2475
2476 static bin_tree_t *
2477 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2478               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2479 {
2480   bin_tree_t *tree = NULL, *old_tree = NULL;
2481   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2482   re_token_t start_token = *token;
2483
2484   if (token->type == OP_OPEN_DUP_NUM)
2485     {
2486       end = 0;
2487       start = fetch_number (regexp, token, syntax);
2488       if (start == REG_MISSING)
2489         {
2490           if (token->type == CHARACTER && token->opr.c == ',')
2491             start = 0; /* We treat "{,m}" as "{0,m}".  */
2492           else
2493             {
2494               *err = REG_BADBR; /* <re>{} is invalid.  */
2495               return NULL;
2496             }
2497         }
2498       if (BE (start != REG_ERROR, 1))
2499         {
2500           /* We treat "{n}" as "{n,n}".  */
2501           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2502                  : ((token->type == CHARACTER && token->opr.c == ',')
2503                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2504         }
2505       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2506         {
2507           /* Invalid sequence.  */
2508           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2509             {
2510               if (token->type == END_OF_RE)
2511                 *err = REG_EBRACE;
2512               else
2513                 *err = REG_BADBR;
2514
2515               return NULL;
2516             }
2517
2518           /* If the syntax bit is set, rollback.  */
2519           re_string_set_index (regexp, start_idx);
2520           *token = start_token;
2521           token->type = CHARACTER;
2522           /* mb_partial and word_char bits should be already initialized by
2523              peek_token.  */
2524           return elem;
2525         }
2526
2527       if (BE ((end != REG_MISSING && start > end)
2528               || token->type != OP_CLOSE_DUP_NUM, 0))
2529         {
2530           /* First number greater than second.  */
2531           *err = REG_BADBR;
2532           return NULL;
2533         }
2534     }
2535   else
2536     {
2537       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2538       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2539     }
2540
2541   fetch_token (token, regexp, syntax);
2542
2543   if (BE (elem == NULL, 0))
2544     return NULL;
2545   if (BE (start == 0 && end == 0, 0))
2546     {
2547       postorder (elem, free_tree, NULL);
2548       return NULL;
2549     }
2550
2551   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2552   if (BE (start > 0, 0))
2553     {
2554       tree = elem;
2555       for (i = 2; i <= start; ++i)
2556         {
2557           elem = duplicate_tree (elem, dfa);
2558           tree = create_tree (dfa, tree, elem, CONCAT);
2559           if (BE (elem == NULL || tree == NULL, 0))
2560             goto parse_dup_op_espace;
2561         }
2562
2563       if (start == end)
2564         return tree;
2565
2566       /* Duplicate ELEM before it is marked optional.  */
2567       elem = duplicate_tree (elem, dfa);
2568       old_tree = tree;
2569     }
2570   else
2571     old_tree = NULL;
2572
2573   if (elem->token.type == SUBEXP)
2574     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2575
2576   tree = create_tree (dfa, elem, NULL,
2577                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2578   if (BE (tree == NULL, 0))
2579     goto parse_dup_op_espace;
2580
2581 /* From gnulib's "intprops.h":
2582    True if the arithmetic type T is signed.  */
2583 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2584
2585   /* This loop is actually executed only when end != REG_MISSING,
2586      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2587      already created the start+1-th copy.  */
2588   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2589     for (i = start + 2; i <= end; ++i)
2590       {
2591         elem = duplicate_tree (elem, dfa);
2592         tree = create_tree (dfa, tree, elem, CONCAT);
2593         if (BE (elem == NULL || tree == NULL, 0))
2594           goto parse_dup_op_espace;
2595
2596         tree = create_tree (dfa, tree, NULL, OP_ALT);
2597         if (BE (tree == NULL, 0))
2598           goto parse_dup_op_espace;
2599       }
2600
2601   if (old_tree)
2602     tree = create_tree (dfa, old_tree, tree, CONCAT);
2603
2604   return tree;
2605
2606  parse_dup_op_espace:
2607   *err = REG_ESPACE;
2608   return NULL;
2609 }
2610
2611 /* Size of the names for collating symbol/equivalence_class/character_class.
2612    I'm not sure, but maybe enough.  */
2613 #define BRACKET_NAME_BUF_SIZE 32
2614
2615 #ifndef _LIBC
2616   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2617      Build the range expression which starts from START_ELEM, and ends
2618      at END_ELEM.  The result are written to MBCSET and SBCSET.
2619      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2620      mbcset->range_ends, is a pointer argument sinse we may
2621      update it.  */
2622
2623 static reg_errcode_t
2624 internal_function
2625 # ifdef RE_ENABLE_I18N
2626 build_range_exp (const reg_syntax_t syntax,
2627                  bitset_t sbcset,
2628                  re_charset_t *mbcset,
2629                  Idx *range_alloc,
2630                  const bracket_elem_t *start_elem,
2631                  const bracket_elem_t *end_elem)
2632 # else /* not RE_ENABLE_I18N */
2633 build_range_exp (const reg_syntax_t syntax,
2634                  bitset_t sbcset,
2635                  const bracket_elem_t *start_elem,
2636                  const bracket_elem_t *end_elem)
2637 # endif /* not RE_ENABLE_I18N */
2638 {
2639   unsigned int start_ch, end_ch;
2640   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2641   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2642           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2643           0))
2644     return REG_ERANGE;
2645
2646   /* We can handle no multi character collating elements without libc
2647      support.  */
2648   if (BE ((start_elem->type == COLL_SYM
2649            && strlen ((char *) start_elem->opr.name) > 1)
2650           || (end_elem->type == COLL_SYM
2651               && strlen ((char *) end_elem->opr.name) > 1), 0))
2652     return REG_ECOLLATE;
2653
2654 # ifdef RE_ENABLE_I18N
2655   {
2656     wchar_t wc;
2657     wint_t start_wc;
2658     wint_t end_wc;
2659     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2660
2661     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2662                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2663                    : 0));
2664     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2665               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2666                  : 0));
2667     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2668                 ? __btowc (start_ch) : start_elem->opr.wch);
2669     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2670               ? __btowc (end_ch) : end_elem->opr.wch);
2671     if (start_wc == WEOF || end_wc == WEOF)
2672       return REG_ECOLLATE;
2673     cmp_buf[0] = start_wc;
2674     cmp_buf[4] = end_wc;
2675
2676     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2677             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2678       return REG_ERANGE;
2679
2680     /* Got valid collation sequence values, add them as a new entry.
2681        However, for !_LIBC we have no collation elements: if the
2682        character set is single byte, the single byte character set
2683        that we build below suffices.  parse_bracket_exp passes
2684        no MBCSET if dfa->mb_cur_max == 1.  */
2685     if (mbcset)
2686       {
2687         /* Check the space of the arrays.  */
2688         if (BE (*range_alloc == mbcset->nranges, 0))
2689           {
2690             /* There is not enough space, need realloc.  */
2691             wchar_t *new_array_start, *new_array_end;
2692             Idx new_nranges;
2693
2694             /* +1 in case of mbcset->nranges is 0.  */
2695             new_nranges = 2 * mbcset->nranges + 1;
2696             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2697                are NULL if *range_alloc == 0.  */
2698             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2699                                           new_nranges);
2700             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2701                                         new_nranges);
2702
2703             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2704               return REG_ESPACE;
2705
2706             mbcset->range_starts = new_array_start;
2707             mbcset->range_ends = new_array_end;
2708             *range_alloc = new_nranges;
2709           }
2710
2711         mbcset->range_starts[mbcset->nranges] = start_wc;
2712         mbcset->range_ends[mbcset->nranges++] = end_wc;
2713       }
2714
2715     /* Build the table for single byte characters.  */
2716     for (wc = 0; wc < SBC_MAX; ++wc)
2717       {
2718         cmp_buf[2] = wc;
2719         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2720             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2721           bitset_set (sbcset, wc);
2722       }
2723   }
2724 # else /* not RE_ENABLE_I18N */
2725   {
2726     unsigned int ch;
2727     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2728                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2729                    : 0));
2730     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2731               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2732                  : 0));
2733     if (start_ch > end_ch)
2734       return REG_ERANGE;
2735     /* Build the table for single byte characters.  */
2736     for (ch = 0; ch < SBC_MAX; ++ch)
2737       if (start_ch <= ch  && ch <= end_ch)
2738         bitset_set (sbcset, ch);
2739   }
2740 # endif /* not RE_ENABLE_I18N */
2741   return REG_NOERROR;
2742 }
2743 #endif /* not _LIBC */
2744
2745 #ifndef _LIBC
2746 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2747    Build the collating element which is represented by NAME.
2748    The result are written to MBCSET and SBCSET.
2749    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2750    pointer argument since we may update it.  */
2751
2752 static reg_errcode_t
2753 internal_function
2754 build_collating_symbol (bitset_t sbcset,
2755 # ifdef RE_ENABLE_I18N
2756                         re_charset_t *mbcset _UNUSED_PARAMETER_,
2757                         Idx *coll_sym_alloc _UNUSED_PARAMETER_,
2758 # endif
2759                         const unsigned char *name)
2760 {
2761   size_t name_len = strlen ((const char *) name);
2762   if (BE (name_len != 1, 0))
2763     return REG_ECOLLATE;
2764   else
2765     {
2766       bitset_set (sbcset, name[0]);
2767       return REG_NOERROR;
2768     }
2769 }
2770 #endif /* not _LIBC */
2771
2772 /* This function parse bracket expression like "[abc]", "[a-c]",
2773    "[[.a-a.]]" etc.  */
2774
2775 static bin_tree_t *
2776 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2777                    reg_syntax_t syntax, reg_errcode_t *err)
2778 {
2779 #ifdef _LIBC
2780   const unsigned char *collseqmb;
2781   const char *collseqwc;
2782   uint32_t nrules;
2783   int32_t table_size;
2784   const int32_t *symb_table;
2785   const unsigned char *extra;
2786
2787   /* Local function for parse_bracket_exp used in _LIBC environement.
2788      Seek the collating symbol entry correspondings to NAME.
2789      Return the index of the symbol in the SYMB_TABLE.  */
2790
2791   auto inline int32_t
2792   __attribute ((always_inline))
2793   seek_collating_symbol_entry (name, name_len)
2794          const unsigned char *name;
2795          size_t name_len;
2796     {
2797       int32_t hash = elem_hash ((const char *) name, name_len);
2798       int32_t elem = hash % table_size;
2799       if (symb_table[2 * elem] != 0)
2800         {
2801           int32_t second = hash % (table_size - 2) + 1;
2802
2803           do
2804             {
2805               /* First compare the hashing value.  */
2806               if (symb_table[2 * elem] == hash
2807                   /* Compare the length of the name.  */
2808                   && name_len == extra[symb_table[2 * elem + 1]]
2809                   /* Compare the name.  */
2810                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2811                              name_len) == 0)
2812                 {
2813                   /* Yep, this is the entry.  */
2814                   break;
2815                 }
2816
2817               /* Next entry.  */
2818               elem += second;
2819             }
2820           while (symb_table[2 * elem] != 0);
2821         }
2822       return elem;
2823     }
2824
2825   /* Local function for parse_bracket_exp used in _LIBC environment.
2826      Look up the collation sequence value of BR_ELEM.
2827      Return the value if succeeded, UINT_MAX otherwise.  */
2828
2829   auto inline unsigned int
2830   __attribute ((always_inline))
2831   lookup_collation_sequence_value (br_elem)
2832          bracket_elem_t *br_elem;
2833     {
2834       if (br_elem->type == SB_CHAR)
2835         {
2836           /*
2837           if (MB_CUR_MAX == 1)
2838           */
2839           if (nrules == 0)
2840             return collseqmb[br_elem->opr.ch];
2841           else
2842             {
2843               wint_t wc = __btowc (br_elem->opr.ch);
2844               return __collseq_table_lookup (collseqwc, wc);
2845             }
2846         }
2847       else if (br_elem->type == MB_CHAR)
2848         {
2849           if (nrules != 0)
2850             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2851         }
2852       else if (br_elem->type == COLL_SYM)
2853         {
2854           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2855           if (nrules != 0)
2856             {
2857               int32_t elem, idx;
2858               elem = seek_collating_symbol_entry (br_elem->opr.name,
2859                                                   sym_name_len);
2860               if (symb_table[2 * elem] != 0)
2861                 {
2862                   /* We found the entry.  */
2863                   idx = symb_table[2 * elem + 1];
2864                   /* Skip the name of collating element name.  */
2865                   idx += 1 + extra[idx];
2866                   /* Skip the byte sequence of the collating element.  */
2867                   idx += 1 + extra[idx];
2868                   /* Adjust for the alignment.  */
2869                   idx = (idx + 3) & ~3;
2870                   /* Skip the multibyte collation sequence value.  */
2871                   idx += sizeof (unsigned int);
2872                   /* Skip the wide char sequence of the collating element.  */
2873                   idx += sizeof (unsigned int) *
2874                     (1 + *(unsigned int *) (extra + idx));
2875                   /* Return the collation sequence value.  */
2876                   return *(unsigned int *) (extra + idx);
2877                 }
2878               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2879                 {
2880                   /* No valid character.  Match it as a single byte
2881                      character.  */
2882                   return collseqmb[br_elem->opr.name[0]];
2883                 }
2884             }
2885           else if (sym_name_len == 1)
2886             return collseqmb[br_elem->opr.name[0]];
2887         }
2888       return UINT_MAX;
2889     }
2890
2891   /* Local function for parse_bracket_exp used in _LIBC environement.
2892      Build the range expression which starts from START_ELEM, and ends
2893      at END_ELEM.  The result are written to MBCSET and SBCSET.
2894      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2895      mbcset->range_ends, is a pointer argument sinse we may
2896      update it.  */
2897
2898   auto inline reg_errcode_t
2899   __attribute ((always_inline))
2900   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2901          re_charset_t *mbcset;
2902          Idx *range_alloc;
2903          bitset_t sbcset;
2904          bracket_elem_t *start_elem, *end_elem;
2905     {
2906       unsigned int ch;
2907       uint32_t start_collseq;
2908       uint32_t end_collseq;
2909
2910       /* Equivalence Classes and Character Classes can't be a range
2911          start/end.  */
2912       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2913               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2914               0))
2915         return REG_ERANGE;
2916
2917       start_collseq = lookup_collation_sequence_value (start_elem);
2918       end_collseq = lookup_collation_sequence_value (end_elem);
2919       /* Check start/end collation sequence values.  */
2920       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2921         return REG_ECOLLATE;
2922       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2923         return REG_ERANGE;
2924
2925       /* Got valid collation sequence values, add them as a new entry.
2926          However, if we have no collation elements, and the character set
2927          is single byte, the single byte character set that we
2928          build below suffices. */
2929       if (nrules > 0 || dfa->mb_cur_max > 1)
2930         {
2931           /* Check the space of the arrays.  */
2932           if (BE (*range_alloc == mbcset->nranges, 0))
2933             {
2934               /* There is not enough space, need realloc.  */
2935               uint32_t *new_array_start;
2936               uint32_t *new_array_end;
2937               Idx new_nranges;
2938
2939               /* +1 in case of mbcset->nranges is 0.  */
2940               new_nranges = 2 * mbcset->nranges + 1;
2941               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2942                                             new_nranges);
2943               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2944                                           new_nranges);
2945
2946               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2947                 return REG_ESPACE;
2948
2949               mbcset->range_starts = new_array_start;
2950               mbcset->range_ends = new_array_end;
2951               *range_alloc = new_nranges;
2952             }
2953
2954           mbcset->range_starts[mbcset->nranges] = start_collseq;
2955           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2956         }
2957
2958       /* Build the table for single byte characters.  */
2959       for (ch = 0; ch < SBC_MAX; ch++)
2960         {
2961           uint32_t ch_collseq;
2962           /*
2963           if (MB_CUR_MAX == 1)
2964           */
2965           if (nrules == 0)
2966             ch_collseq = collseqmb[ch];
2967           else
2968             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2969           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2970             bitset_set (sbcset, ch);
2971         }
2972       return REG_NOERROR;
2973     }
2974
2975   /* Local function for parse_bracket_exp used in _LIBC environement.
2976      Build the collating element which is represented by NAME.
2977      The result are written to MBCSET and SBCSET.
2978      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2979      pointer argument sinse we may update it.  */
2980
2981   auto inline reg_errcode_t
2982   __attribute ((always_inline))
2983   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2984          re_charset_t *mbcset;
2985          Idx *coll_sym_alloc;
2986          bitset_t sbcset;
2987          const unsigned char *name;
2988     {
2989       int32_t elem, idx;
2990       size_t name_len = strlen ((const char *) name);
2991       if (nrules != 0)
2992         {
2993           elem = seek_collating_symbol_entry (name, name_len);
2994           if (symb_table[2 * elem] != 0)
2995             {
2996               /* We found the entry.  */
2997               idx = symb_table[2 * elem + 1];
2998               /* Skip the name of collating element name.  */
2999               idx += 1 + extra[idx];
3000             }
3001           else if (symb_table[2 * elem] == 0 && name_len == 1)
3002             {
3003               /* No valid character, treat it as a normal
3004                  character.  */
3005               bitset_set (sbcset, name[0]);
3006               return REG_NOERROR;
3007             }
3008           else
3009             return REG_ECOLLATE;
3010
3011           /* Got valid collation sequence, add it as a new entry.  */
3012           /* Check the space of the arrays.  */
3013           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3014             {
3015               /* Not enough, realloc it.  */
3016               /* +1 in case of mbcset->ncoll_syms is 0.  */
3017               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3018               /* Use realloc since mbcset->coll_syms is NULL
3019                  if *alloc == 0.  */
3020               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3021                                                    new_coll_sym_alloc);
3022               if (BE (new_coll_syms == NULL, 0))
3023                 return REG_ESPACE;
3024               mbcset->coll_syms = new_coll_syms;
3025               *coll_sym_alloc = new_coll_sym_alloc;
3026             }
3027           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3028           return REG_NOERROR;
3029         }
3030       else
3031         {
3032           if (BE (name_len != 1, 0))
3033             return REG_ECOLLATE;
3034           else
3035             {
3036               bitset_set (sbcset, name[0]);
3037               return REG_NOERROR;
3038             }
3039         }
3040     }
3041 #endif
3042
3043   re_token_t br_token;
3044   re_bitset_ptr_t sbcset;
3045 #ifdef RE_ENABLE_I18N
3046   re_charset_t *mbcset;
3047   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3048   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3049 #endif /* not RE_ENABLE_I18N */
3050   bool non_match = false;
3051   bin_tree_t *work_tree;
3052   int token_len;
3053   bool first_round = true;
3054 #ifdef _LIBC
3055   collseqmb = (const unsigned char *)
3056     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3057   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3058   if (nrules)
3059     {
3060       /*
3061       if (MB_CUR_MAX > 1)
3062       */
3063       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3064       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3065       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3066                                                   _NL_COLLATE_SYMB_TABLEMB);
3067       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3068                                                    _NL_COLLATE_SYMB_EXTRAMB);
3069     }
3070 #endif
3071   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3072 #ifdef RE_ENABLE_I18N
3073   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3074 #endif /* RE_ENABLE_I18N */
3075 #ifdef RE_ENABLE_I18N
3076   if (BE (sbcset == NULL || mbcset == NULL, 0))
3077 #else
3078   if (BE (sbcset == NULL, 0))
3079 #endif /* RE_ENABLE_I18N */
3080     {
3081       *err = REG_ESPACE;
3082       return NULL;
3083     }
3084
3085   token_len = peek_token_bracket (token, regexp, syntax);
3086   if (BE (token->type == END_OF_RE, 0))
3087     {
3088       *err = REG_BADPAT;
3089       goto parse_bracket_exp_free_return;
3090     }
3091   if (token->type == OP_NON_MATCH_LIST)
3092     {
3093 #ifdef RE_ENABLE_I18N
3094       mbcset->non_match = 1;
3095 #endif /* not RE_ENABLE_I18N */
3096       non_match = true;
3097       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3098         bitset_set (sbcset, '\n');
3099       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3100       token_len = peek_token_bracket (token, regexp, syntax);
3101       if (BE (token->type == END_OF_RE, 0))
3102         {
3103           *err = REG_BADPAT;
3104           goto parse_bracket_exp_free_return;
3105         }
3106     }
3107
3108   /* We treat the first ']' as a normal character.  */
3109   if (token->type == OP_CLOSE_BRACKET)
3110     token->type = CHARACTER;
3111
3112   while (1)
3113     {
3114       bracket_elem_t start_elem, end_elem;
3115       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3116       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3117       reg_errcode_t ret;
3118       int token_len2 = 0;
3119       bool is_range_exp = false;
3120       re_token_t token2;
3121
3122       start_elem.opr.name = start_name_buf;
3123       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3124                                    syntax, first_round);
3125       if (BE (ret != REG_NOERROR, 0))
3126         {
3127           *err = ret;
3128           goto parse_bracket_exp_free_return;
3129         }
3130       first_round = false;
3131
3132       /* Get information about the next token.  We need it in any case.  */
3133       token_len = peek_token_bracket (token, regexp, syntax);
3134
3135       /* Do not check for ranges if we know they are not allowed.  */
3136       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3137         {
3138           if (BE (token->type == END_OF_RE, 0))
3139             {
3140               *err = REG_EBRACK;
3141               goto parse_bracket_exp_free_return;
3142             }
3143           if (token->type == OP_CHARSET_RANGE)
3144             {
3145               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3146               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3147               if (BE (token2.type == END_OF_RE, 0))
3148                 {
3149                   *err = REG_EBRACK;
3150                   goto parse_bracket_exp_free_return;
3151                 }
3152               if (token2.type == OP_CLOSE_BRACKET)
3153                 {
3154                   /* We treat the last '-' as a normal character.  */
3155                   re_string_skip_bytes (regexp, -token_len);
3156                   token->type = CHARACTER;
3157                 }
3158               else
3159                 is_range_exp = true;
3160             }
3161         }
3162
3163       if (is_range_exp == true)
3164         {
3165           end_elem.opr.name = end_name_buf;
3166           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3167                                        dfa, syntax, true);
3168           if (BE (ret != REG_NOERROR, 0))
3169             {
3170               *err = ret;
3171               goto parse_bracket_exp_free_return;
3172             }
3173
3174           token_len = peek_token_bracket (token, regexp, syntax);
3175
3176 #ifdef _LIBC
3177           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3178                                   &start_elem, &end_elem);
3179 #else
3180 # ifdef RE_ENABLE_I18N
3181           *err = build_range_exp (syntax, sbcset,
3182                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3183                                   &range_alloc, &start_elem, &end_elem);
3184 # else
3185           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3186 # endif
3187 #endif /* RE_ENABLE_I18N */
3188           if (BE (*err != REG_NOERROR, 0))
3189             goto parse_bracket_exp_free_return;
3190         }
3191       else
3192         {
3193           switch (start_elem.type)
3194             {
3195             case SB_CHAR:
3196               bitset_set (sbcset, start_elem.opr.ch);
3197               break;
3198 #ifdef RE_ENABLE_I18N
3199             case MB_CHAR:
3200               /* Check whether the array has enough space.  */
3201               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3202                 {
3203                   wchar_t *new_mbchars;
3204                   /* Not enough, realloc it.  */
3205                   /* +1 in case of mbcset->nmbchars is 0.  */
3206                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3207                   /* Use realloc since array is NULL if *alloc == 0.  */
3208                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3209                                             mbchar_alloc);
3210                   if (BE (new_mbchars == NULL, 0))
3211                     goto parse_bracket_exp_espace;
3212                   mbcset->mbchars = new_mbchars;
3213                 }
3214               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3215               break;
3216 #endif /* RE_ENABLE_I18N */
3217             case EQUIV_CLASS:
3218               *err = build_equiv_class (sbcset,
3219 #ifdef RE_ENABLE_I18N
3220                                         mbcset, &equiv_class_alloc,
3221 #endif /* RE_ENABLE_I18N */
3222                                         start_elem.opr.name);
3223               if (BE (*err != REG_NOERROR, 0))
3224                 goto parse_bracket_exp_free_return;
3225               break;
3226             case COLL_SYM:
3227               *err = build_collating_symbol (sbcset,
3228 #ifdef RE_ENABLE_I18N
3229                                              mbcset, &coll_sym_alloc,
3230 #endif /* RE_ENABLE_I18N */
3231                                              start_elem.opr.name);
3232               if (BE (*err != REG_NOERROR, 0))
3233                 goto parse_bracket_exp_free_return;
3234               break;
3235             case CHAR_CLASS:
3236               *err = build_charclass (regexp->trans, sbcset,
3237 #ifdef RE_ENABLE_I18N
3238                                       mbcset, &char_class_alloc,
3239 #endif /* RE_ENABLE_I18N */
3240                                       start_elem.opr.name, syntax);
3241               if (BE (*err != REG_NOERROR, 0))
3242                goto parse_bracket_exp_free_return;
3243               break;
3244             default:
3245               assert (0);
3246               break;
3247             }
3248         }
3249       if (BE (token->type == END_OF_RE, 0))
3250         {
3251           *err = REG_EBRACK;
3252           goto parse_bracket_exp_free_return;
3253         }
3254       if (token->type == OP_CLOSE_BRACKET)
3255         break;
3256     }
3257
3258   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3259
3260   /* If it is non-matching list.  */
3261   if (non_match)
3262     bitset_not (sbcset);
3263
3264 #ifdef RE_ENABLE_I18N
3265   /* Ensure only single byte characters are set.  */
3266   if (dfa->mb_cur_max > 1)
3267     bitset_mask (sbcset, dfa->sb_char);
3268
3269   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3270       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3271                                                      || mbcset->non_match)))
3272     {
3273       bin_tree_t *mbc_tree;
3274       int sbc_idx;
3275       /* Build a tree for complex bracket.  */
3276       dfa->has_mb_node = 1;
3277       br_token.type = COMPLEX_BRACKET;
3278       br_token.opr.mbcset = mbcset;
3279       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3280       if (BE (mbc_tree == NULL, 0))
3281         goto parse_bracket_exp_espace;
3282       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3283         if (sbcset[sbc_idx])
3284           break;
3285       /* If there are no bits set in sbcset, there is no point
3286          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3287       if (sbc_idx < BITSET_WORDS)
3288         {
3289           /* Build a tree for simple bracket.  */
3290           br_token.type = SIMPLE_BRACKET;
3291           br_token.opr.sbcset = sbcset;
3292           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3293           if (BE (work_tree == NULL, 0))
3294             goto parse_bracket_exp_espace;
3295
3296           /* Then join them by ALT node.  */
3297           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3298           if (BE (work_tree == NULL, 0))
3299             goto parse_bracket_exp_espace;
3300         }
3301       else
3302         {
3303           re_free (sbcset);
3304           work_tree = mbc_tree;
3305         }
3306     }
3307   else
3308 #endif /* not RE_ENABLE_I18N */
3309     {
3310 #ifdef RE_ENABLE_I18N
3311       free_charset (mbcset);
3312 #endif
3313       /* Build a tree for simple bracket.  */
3314       br_token.type = SIMPLE_BRACKET;
3315       br_token.opr.sbcset = sbcset;
3316       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3317       if (BE (work_tree == NULL, 0))
3318         goto parse_bracket_exp_espace;
3319     }
3320   return work_tree;
3321
3322  parse_bracket_exp_espace:
3323   *err = REG_ESPACE;
3324  parse_bracket_exp_free_return:
3325   re_free (sbcset);
3326 #ifdef RE_ENABLE_I18N
3327   free_charset (mbcset);
3328 #endif /* RE_ENABLE_I18N */
3329   return NULL;
3330 }
3331
3332 /* Parse an element in the bracket expression.  */
3333
3334 static reg_errcode_t
3335 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3336                        re_token_t *token, int token_len,
3337                        re_dfa_t *dfa _UNUSED_PARAMETER_,
3338                        reg_syntax_t syntax, bool accept_hyphen)
3339 {
3340 #ifdef RE_ENABLE_I18N
3341   int cur_char_size;
3342   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3343   if (cur_char_size > 1)
3344     {
3345       elem->type = MB_CHAR;
3346       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3347       re_string_skip_bytes (regexp, cur_char_size);
3348       return REG_NOERROR;
3349     }
3350 #endif /* RE_ENABLE_I18N */
3351   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3352   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3353       || token->type == OP_OPEN_EQUIV_CLASS)
3354     return parse_bracket_symbol (elem, regexp, token);
3355   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3356     {
3357       /* A '-' must only appear as anything but a range indicator before
3358          the closing bracket.  Everything else is an error.  */
3359       re_token_t token2;
3360       (void) peek_token_bracket (&token2, regexp, syntax);
3361       if (token2.type != OP_CLOSE_BRACKET)
3362         /* The actual error value is not standardized since this whole
3363            case is undefined.  But ERANGE makes good sense.  */
3364         return REG_ERANGE;
3365     }
3366   elem->type = SB_CHAR;
3367   elem->opr.ch = token->opr.c;
3368   return REG_NOERROR;
3369 }
3370
3371 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3372    such as [:<character_class>:], [.<collating_element>.], and
3373    [=<equivalent_class>=].  */
3374
3375 static reg_errcode_t
3376 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3377                       re_token_t *token)
3378 {
3379   unsigned char ch, delim = token->opr.c;
3380   int i = 0;
3381   if (re_string_eoi(regexp))
3382     return REG_EBRACK;
3383   for (;; ++i)
3384     {
3385       if (i >= BRACKET_NAME_BUF_SIZE)
3386         return REG_EBRACK;
3387       if (token->type == OP_OPEN_CHAR_CLASS)
3388         ch = re_string_fetch_byte_case (regexp);
3389       else
3390         ch = re_string_fetch_byte (regexp);
3391       if (re_string_eoi(regexp))
3392         return REG_EBRACK;
3393       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3394         break;
3395       elem->opr.name[i] = ch;
3396     }
3397   re_string_skip_bytes (regexp, 1);
3398   elem->opr.name[i] = '\0';
3399   switch (token->type)
3400     {
3401     case OP_OPEN_COLL_ELEM:
3402       elem->type = COLL_SYM;
3403       break;
3404     case OP_OPEN_EQUIV_CLASS:
3405       elem->type = EQUIV_CLASS;
3406       break;
3407     case OP_OPEN_CHAR_CLASS:
3408       elem->type = CHAR_CLASS;
3409       break;
3410     default:
3411       break;
3412     }
3413   return REG_NOERROR;
3414 }
3415
3416   /* Helper function for parse_bracket_exp.
3417      Build the equivalence class which is represented by NAME.
3418      The result are written to MBCSET and SBCSET.
3419      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3420      is a pointer argument sinse we may update it.  */
3421
3422 static reg_errcode_t
3423 #ifdef RE_ENABLE_I18N
3424 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset _UNUSED_PARAMETER_,
3425                    Idx *equiv_class_alloc _UNUSED_PARAMETER_,
3426                    const unsigned char *name)
3427 #else /* not RE_ENABLE_I18N */
3428 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3429 #endif /* not RE_ENABLE_I18N */
3430 {
3431 #ifdef _LIBC
3432   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3433   if (nrules != 0)
3434     {
3435       const int32_t *table, *indirect;
3436       const unsigned char *weights, *extra, *cp;
3437       unsigned char char_buf[2];
3438       int32_t idx1, idx2;
3439       unsigned int ch;
3440       size_t len;
3441       /* This #include defines a local function!  */
3442 # include <locale/weight.h>
3443       /* Calculate the index for equivalence class.  */
3444       cp = name;
3445       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3446       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3447                                                _NL_COLLATE_WEIGHTMB);
3448       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3449                                                    _NL_COLLATE_EXTRAMB);
3450       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3451                                                 _NL_COLLATE_INDIRECTMB);
3452       idx1 = findidx (&cp);
3453       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3454         /* This isn't a valid character.  */
3455         return REG_ECOLLATE;
3456
3457       /* Build single byte matcing table for this equivalence class.  */
3458       char_buf[1] = (unsigned char) '\0';
3459       len = weights[idx1 & 0xffffff];
3460       for (ch = 0; ch < SBC_MAX; ++ch)
3461         {
3462           char_buf[0] = ch;
3463           cp = char_buf;
3464           idx2 = findidx (&cp);
3465 /*
3466           idx2 = table[ch];
3467 */
3468           if (idx2 == 0)
3469             /* This isn't a valid character.  */
3470             continue;
3471           /* Compare only if the length matches and the collation rule
3472              index is the same.  */
3473           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3474             {
3475               int cnt = 0;
3476
3477               while (cnt <= len &&
3478                      weights[(idx1 & 0xffffff) + 1 + cnt]
3479                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3480                 ++cnt;
3481
3482               if (cnt > len)
3483                 bitset_set (sbcset, ch);
3484             }
3485         }
3486       /* Check whether the array has enough space.  */
3487       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3488         {
3489           /* Not enough, realloc it.  */
3490           /* +1 in case of mbcset->nequiv_classes is 0.  */
3491           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3492           /* Use realloc since the array is NULL if *alloc == 0.  */
3493           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3494                                                    int32_t,
3495                                                    new_equiv_class_alloc);
3496           if (BE (new_equiv_classes == NULL, 0))
3497             return REG_ESPACE;
3498           mbcset->equiv_classes = new_equiv_classes;
3499           *equiv_class_alloc = new_equiv_class_alloc;
3500         }
3501       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3502     }
3503   else
3504 #endif /* _LIBC */
3505     {
3506       if (BE (strlen ((const char *) name) != 1, 0))
3507         return REG_ECOLLATE;
3508       bitset_set (sbcset, *name);
3509     }
3510   return REG_NOERROR;
3511 }
3512
3513   /* Helper function for parse_bracket_exp.
3514      Build the character class which is represented by NAME.
3515      The result are written to MBCSET and SBCSET.
3516      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3517      is a pointer argument sinse we may update it.  */
3518
3519 static reg_errcode_t
3520 #ifdef RE_ENABLE_I18N
3521 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3522                  re_charset_t *mbcset, Idx *char_class_alloc,
3523                  const unsigned char *class_name, reg_syntax_t syntax)
3524 #else /* not RE_ENABLE_I18N */
3525 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3526                  const unsigned char *class_name, reg_syntax_t syntax)
3527 #endif /* not RE_ENABLE_I18N */
3528 {
3529   int i;
3530   const char *name = (const char *) class_name;
3531
3532   /* In case of REG_ICASE "upper" and "lower" match the both of
3533      upper and lower cases.  */
3534   if ((syntax & RE_ICASE)
3535       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3536     name = "alpha";
3537
3538 #ifdef RE_ENABLE_I18N
3539   /* Check the space of the arrays.  */
3540   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3541     {
3542       /* Not enough, realloc it.  */
3543       /* +1 in case of mbcset->nchar_classes is 0.  */
3544       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3545       /* Use realloc since array is NULL if *alloc == 0.  */
3546       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3547                                                new_char_class_alloc);
3548       if (BE (new_char_classes == NULL, 0))
3549         return REG_ESPACE;
3550       mbcset->char_classes = new_char_classes;
3551       *char_class_alloc = new_char_class_alloc;
3552     }
3553   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3554 #endif /* RE_ENABLE_I18N */
3555
3556 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3557   do {                                          \
3558     if (BE (trans != NULL, 0))                  \
3559       {                                         \
3560         for (i = 0; i < SBC_MAX; ++i)           \
3561           if (ctype_func (i))                   \
3562             bitset_set (sbcset, trans[i]);      \
3563       }                                         \
3564     else                                        \
3565       {                                         \
3566         for (i = 0; i < SBC_MAX; ++i)           \
3567           if (ctype_func (i))                   \
3568             bitset_set (sbcset, i);             \
3569       }                                         \
3570   } while (0)
3571
3572   if (strcmp (name, "alnum") == 0)
3573     BUILD_CHARCLASS_LOOP (isalnum);
3574   else if (strcmp (name, "cntrl") == 0)
3575     BUILD_CHARCLASS_LOOP (iscntrl);
3576   else if (strcmp (name, "lower") == 0)
3577     BUILD_CHARCLASS_LOOP (islower);
3578   else if (strcmp (name, "space") == 0)
3579     BUILD_CHARCLASS_LOOP (isspace);
3580   else if (strcmp (name, "alpha") == 0)
3581     BUILD_CHARCLASS_LOOP (isalpha);
3582   else if (strcmp (name, "digit") == 0)
3583     BUILD_CHARCLASS_LOOP (isdigit);
3584   else if (strcmp (name, "print") == 0)
3585     BUILD_CHARCLASS_LOOP (isprint);
3586   else if (strcmp (name, "upper") == 0)
3587     BUILD_CHARCLASS_LOOP (isupper);
3588   else if (strcmp (name, "blank") == 0)
3589     BUILD_CHARCLASS_LOOP (isblank);
3590   else if (strcmp (name, "graph") == 0)
3591     BUILD_CHARCLASS_LOOP (isgraph);
3592   else if (strcmp (name, "punct") == 0)
3593     BUILD_CHARCLASS_LOOP (ispunct);
3594   else if (strcmp (name, "xdigit") == 0)
3595     BUILD_CHARCLASS_LOOP (isxdigit);
3596   else
3597     return REG_ECTYPE;
3598
3599   return REG_NOERROR;
3600 }
3601
3602 static bin_tree_t *
3603 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3604                     const unsigned char *class_name,
3605                     const unsigned char *extra, bool non_match,
3606                     reg_errcode_t *err)
3607 {
3608   re_bitset_ptr_t sbcset;
3609 #ifdef RE_ENABLE_I18N
3610   re_charset_t *mbcset;
3611   Idx alloc = 0;
3612 #endif /* not RE_ENABLE_I18N */
3613   reg_errcode_t ret;
3614   re_token_t br_token;
3615   bin_tree_t *tree;
3616
3617   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3618 #ifdef RE_ENABLE_I18N
3619   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3620 #endif /* RE_ENABLE_I18N */
3621
3622 #ifdef RE_ENABLE_I18N
3623   if (BE (sbcset == NULL || mbcset == NULL, 0))
3624 #else /* not RE_ENABLE_I18N */
3625   if (BE (sbcset == NULL, 0))
3626 #endif /* not RE_ENABLE_I18N */
3627     {
3628       *err = REG_ESPACE;
3629       return NULL;
3630     }
3631
3632   if (non_match)
3633     {
3634 #ifdef RE_ENABLE_I18N
3635       mbcset->non_match = 1;
3636 #endif /* not RE_ENABLE_I18N */
3637     }
3638
3639   /* We don't care the syntax in this case.  */
3640   ret = build_charclass (trans, sbcset,
3641 #ifdef RE_ENABLE_I18N
3642                          mbcset, &alloc,
3643 #endif /* RE_ENABLE_I18N */
3644                          class_name, 0);
3645
3646   if (BE (ret != REG_NOERROR, 0))
3647     {
3648       re_free (sbcset);
3649 #ifdef RE_ENABLE_I18N
3650       free_charset (mbcset);
3651 #endif /* RE_ENABLE_I18N */
3652       *err = ret;
3653       return NULL;
3654     }
3655   /* \w match '_' also.  */
3656   for (; *extra; extra++)
3657     bitset_set (sbcset, *extra);
3658
3659   /* If it is non-matching list.  */
3660   if (non_match)
3661     bitset_not (sbcset);
3662
3663 #ifdef RE_ENABLE_I18N
3664   /* Ensure only single byte characters are set.  */
3665   if (dfa->mb_cur_max > 1)
3666     bitset_mask (sbcset, dfa->sb_char);
3667 #endif
3668
3669   /* Build a tree for simple bracket.  */
3670   br_token.type = SIMPLE_BRACKET;
3671   br_token.opr.sbcset = sbcset;
3672   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3673   if (BE (tree == NULL, 0))
3674     goto build_word_op_espace;
3675
3676 #ifdef RE_ENABLE_I18N
3677   if (dfa->mb_cur_max > 1)
3678     {
3679       bin_tree_t *mbc_tree;
3680       /* Build a tree for complex bracket.  */
3681       br_token.type = COMPLEX_BRACKET;
3682       br_token.opr.mbcset = mbcset;
3683       dfa->has_mb_node = 1;
3684       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3685       if (BE (mbc_tree == NULL, 0))
3686         goto build_word_op_espace;
3687       /* Then join them by ALT node.  */
3688       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3689       if (BE (mbc_tree != NULL, 1))
3690         return tree;
3691     }
3692   else
3693     {
3694       free_charset (mbcset);
3695       return tree;
3696     }
3697 #else /* not RE_ENABLE_I18N */
3698   return tree;
3699 #endif /* not RE_ENABLE_I18N */
3700
3701  build_word_op_espace:
3702   re_free (sbcset);
3703 #ifdef RE_ENABLE_I18N
3704   free_charset (mbcset);
3705 #endif /* RE_ENABLE_I18N */
3706   *err = REG_ESPACE;
3707   return NULL;
3708 }
3709
3710 /* This is intended for the expressions like "a{1,3}".
3711    Fetch a number from `input', and return the number.
3712    Return REG_MISSING if the number field is empty like "{,1}".
3713    Return REG_ERROR if an error occurred.  */
3714
3715 static Idx
3716 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3717 {
3718   Idx num = REG_MISSING;
3719   unsigned char c;
3720   while (1)
3721     {
3722       fetch_token (token, input, syntax);
3723       c = token->opr.c;
3724       if (BE (token->type == END_OF_RE, 0))
3725         return REG_ERROR;
3726       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3727         break;
3728       num = ((token->type != CHARACTER || c < '0' || '9' < c
3729               || num == REG_ERROR)
3730              ? REG_ERROR
3731              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3732       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3733     }
3734   return num;
3735 }
3736 \f
3737 #ifdef RE_ENABLE_I18N
3738 static void
3739 free_charset (re_charset_t *cset)
3740 {
3741   re_free (cset->mbchars);
3742 # ifdef _LIBC
3743   re_free (cset->coll_syms);
3744   re_free (cset->equiv_classes);
3745   re_free (cset->range_starts);
3746   re_free (cset->range_ends);
3747 # endif
3748   re_free (cset->char_classes);
3749   re_free (cset);
3750 }
3751 #endif /* RE_ENABLE_I18N */
3752 \f
3753 /* Functions for binary tree operation.  */
3754
3755 /* Create a tree node.  */
3756
3757 static bin_tree_t *
3758 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3759              re_token_type_t type)
3760 {
3761   re_token_t t;
3762   t.type = type;
3763   return create_token_tree (dfa, left, right, &t);
3764 }
3765
3766 static bin_tree_t *
3767 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3768                    const re_token_t *token)
3769 {
3770   bin_tree_t *tree;
3771   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3772     {
3773       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3774
3775       if (storage == NULL)
3776         return NULL;
3777       storage->next = dfa->str_tree_storage;
3778       dfa->str_tree_storage = storage;
3779       dfa->str_tree_storage_idx = 0;
3780     }
3781   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3782
3783   tree->parent = NULL;
3784   tree->left = left;
3785   tree->right = right;
3786   tree->token = *token;
3787   tree->token.duplicated = 0;
3788   tree->token.opt_subexp = 0;
3789   tree->first = NULL;
3790   tree->next = NULL;
3791   tree->node_idx = REG_MISSING;
3792
3793   if (left != NULL)
3794     left->parent = tree;
3795   if (right != NULL)
3796     right->parent = tree;
3797   return tree;
3798 }
3799
3800 /* Mark the tree SRC as an optional subexpression.
3801    To be called from preorder or postorder.  */
3802
3803 static reg_errcode_t
3804 mark_opt_subexp (void *extra, bin_tree_t *node)
3805 {
3806   Idx idx = (Idx) (long) extra;
3807   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3808     node->token.opt_subexp = 1;
3809
3810   return REG_NOERROR;
3811 }
3812
3813 /* Free the allocated memory inside NODE. */
3814
3815 static void
3816 free_token (re_token_t *node)
3817 {
3818 #ifdef RE_ENABLE_I18N
3819   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3820     free_charset (node->opr.mbcset);
3821   else
3822 #endif /* RE_ENABLE_I18N */
3823     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3824       re_free (node->opr.sbcset);
3825 }
3826
3827 /* Worker function for tree walking.  Free the allocated memory inside NODE
3828    and its children. */
3829
3830 static reg_errcode_t
3831 free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
3832 {
3833   free_token (&node->token);
3834   return REG_NOERROR;
3835 }
3836
3837
3838 /* Duplicate the node SRC, and return new node.  This is a preorder
3839    visit similar to the one implemented by the generic visitor, but
3840    we need more infrastructure to maintain two parallel trees --- so,
3841    it's easier to duplicate.  */
3842
3843 static bin_tree_t *
3844 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3845 {
3846   const bin_tree_t *node;
3847   bin_tree_t *dup_root;
3848   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3849
3850   for (node = root; ; )
3851     {
3852       /* Create a new tree and link it back to the current parent.  */
3853       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3854       if (*p_new == NULL)
3855         return NULL;
3856       (*p_new)->parent = dup_node;
3857       (*p_new)->token.duplicated = 1;
3858       dup_node = *p_new;
3859
3860       /* Go to the left node, or up and to the right.  */
3861       if (node->left)
3862         {
3863           node = node->left;
3864           p_new = &dup_node->left;
3865         }
3866       else
3867         {
3868           const bin_tree_t *prev = NULL;
3869           while (node->right == prev || node->right == NULL)
3870             {
3871               prev = node;
3872               node = node->parent;
3873               dup_node = dup_node->parent;
3874               if (!node)
3875                 return dup_root;
3876             }
3877           node = node->right;
3878           p_new = &dup_node->right;
3879         }
3880     }
3881 }