contrib/grep/lib/regcomp.c

   1 /* -*- buffer-read-only: t -*- vi: set ro: */
   2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
   3 /* Extended regular expression matching and search library.
   4    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
   5    Software Foundation, Inc.
   6    This file is part of the GNU C Library.
   7    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 3, or (at your option)
  12    any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License along
  20    with this program; if not, write to the Free Software Foundation,
  21    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  22
  23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  24                                           size_t length, reg_syntax_t syntax);
  25 static void re_compile_fastmap_iter (regex_t *bufp,
  26                                      const re_dfastate_t *init_state,
  27                                      char *fastmap);
  28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  29 #ifdef RE_ENABLE_I18N
  30 static void free_charset (re_charset_t *cset);
  31 #endif /* RE_ENABLE_I18N */
  32 static void free_workarea_compile (regex_t *preg);
  33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  34 #ifdef RE_ENABLE_I18N
  35 static void optimize_utf8 (re_dfa_t *dfa);
  36 #endif
  37 static reg_errcode_t analyze (regex_t *preg);
  38 static reg_errcode_t preorder (bin_tree_t *root,
  39                                reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                void *extra);
  41 static reg_errcode_t postorder (bin_tree_t *root,
  42                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                 void *extra);
  44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  47                                  bin_tree_t *node);
  48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  53                                    unsigned int constraint);
  54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  56                                          Idx node, bool root);
  57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  58 static Idx fetch_number (re_string_t *input, re_token_t *token,
  59                          reg_syntax_t syntax);
  60 static int peek_token (re_token_t *token, re_string_t *input,
  61                         reg_syntax_t syntax) internal_function;
  62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  63                           reg_syntax_t syntax, reg_errcode_t *err);
  64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  65                                   re_token_t *token, reg_syntax_t syntax,
  66                                   Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  68                                  re_token_t *token, reg_syntax_t syntax,
  69                                  Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  71                                      re_token_t *token, reg_syntax_t syntax,
  72                                      Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  74                                   re_token_t *token, reg_syntax_t syntax,
  75                                   Idx nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  77                                  re_dfa_t *dfa, re_token_t *token,
  78                                  reg_syntax_t syntax, reg_errcode_t *err);
  79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  80                                       re_token_t *token, reg_syntax_t syntax,
  81                                       reg_errcode_t *err);
  82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  83                                             re_string_t *regexp,
  84                                             re_token_t *token, int token_len,
  85                                             re_dfa_t *dfa,
  86                                             reg_syntax_t syntax,
  87                                             bool accept_hyphen);
  88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  89                                           re_string_t *regexp,
  90                                           re_token_t *token);
  91 #ifdef RE_ENABLE_I18N
  92 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  93                                         re_charset_t *mbcset,
  94                                         Idx *equiv_class_alloc,
  95                                         const unsigned char *name);
  96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  97                                       bitset_t sbcset,
  98                                       re_charset_t *mbcset,
  99                                       Idx *char_class_alloc,
 100                                       const unsigned char *class_name,
 101                                       reg_syntax_t syntax);
 102 #else  /* not RE_ENABLE_I18N */
 103 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 104                                         const unsigned char *name);
 105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 106                                       bitset_t sbcset,
 107                                       const unsigned char *class_name,
 108                                       reg_syntax_t syntax);
 109 #endif /* not RE_ENABLE_I18N */
 110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 111                                        RE_TRANSLATE_TYPE trans,
 112                                        const unsigned char *class_name,
 113                                        const unsigned char *extra,
 114                                        bool non_match, reg_errcode_t *err);
 115 static bin_tree_t *create_tree (re_dfa_t *dfa,
 116                                 bin_tree_t *left, bin_tree_t *right,
 117                                 re_token_type_t type);
 118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 119                                       bin_tree_t *left, bin_tree_t *right,
 120                                       const re_token_t *token);
 121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 122 static void free_token (re_token_t *node);
 123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 125 \f
 126 /* This table gives an error message for each of the error codes listed
 127    in regex.h.  Obviously the order here has to be same as there.
 128    POSIX doesn't require that we do anything for REG_NOERROR,
 129    but why not be nice?  */
 130
 131 static const char __re_error_msgid[] =
 132   {
 133 #define REG_NOERROR_IDX 0
 134     gettext_noop ("Success")    /* REG_NOERROR */
 135     "\0"
 136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 137     gettext_noop ("No match")   /* REG_NOMATCH */
 138     "\0"
 139 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 140     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 141     "\0"
 142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 143     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 144     "\0"
 145 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 146     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 147     "\0"
 148 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 149     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 150     "\0"
 151 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 152     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 153     "\0"
 154 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 155     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 156     "\0"
 157 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 158     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 159     "\0"
 160 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 161     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 162     "\0"
 163 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 164     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 165     "\0"
 166 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 167     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 168     "\0"
 169 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 170     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 171     "\0"
 172 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 173     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 174     "\0"
 175 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 176     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 177     "\0"
 178 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 179     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 180     "\0"
 181 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 182     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 183   };
 184
 185 static const size_t __re_error_msgid_idx[] =
 186   {
 187     REG_NOERROR_IDX,
 188     REG_NOMATCH_IDX,
 189     REG_BADPAT_IDX,
 190     REG_ECOLLATE_IDX,
 191     REG_ECTYPE_IDX,
 192     REG_EESCAPE_IDX,
 193     REG_ESUBREG_IDX,
 194     REG_EBRACK_IDX,
 195     REG_EPAREN_IDX,
 196     REG_EBRACE_IDX,
 197     REG_BADBR_IDX,
 198     REG_ERANGE_IDX,
 199     REG_ESPACE_IDX,
 200     REG_BADRPT_IDX,
 201     REG_EEND_IDX,
 202     REG_ESIZE_IDX,
 203     REG_ERPAREN_IDX
 204   };
 205 \f
 206 /* Entry points for GNU code.  */
 207
 208 /* re_compile_pattern is the GNU regular expression compiler: it
 209    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 210    Returns 0 if the pattern was valid, otherwise an error string.
 211
 212    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 213    are set in BUFP on entry.  */
 214
 215 #ifdef _LIBC
 216 const char *
 217 re_compile_pattern (pattern, length, bufp)
 218     const char *pattern;
 219     size_t length;
 220     struct re_pattern_buffer *bufp;
 221 #else /* size_t might promote */
 222 const char *
 223 re_compile_pattern (const char *pattern, size_t length,
 224                     struct re_pattern_buffer *bufp)
 225 #endif
 226 {
 227   reg_errcode_t ret;
 228
 229   /* And GNU code determines whether or not to get register information
 230      by passing null for the REGS argument to re_match, etc., not by
 231      setting no_sub, unless RE_NO_SUB is set.  */
 232   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 233
 234   /* Match anchors at newline.  */
 235   bufp->newline_anchor = 1;
 236
 237   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 238
 239   if (!ret)
 240     return NULL;
 241   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 242 }
 243 #ifdef _LIBC
 244 weak_alias (__re_compile_pattern, re_compile_pattern)
 245 #endif
 246
 247 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 248    also be assigned to arbitrarily: each pattern buffer stores its own
 249    syntax, so it can be changed between regex compilations.  */
 250 /* This has no initializer because initialized variables in Emacs
 251    become read-only after dumping.  */
 252 reg_syntax_t re_syntax_options;
 253
 254
 255 /* Specify the precise syntax of regexps for compilation.  This provides
 256    for compatibility for various utilities which historically have
 257    different, incompatible syntaxes.
 258
 259    The argument SYNTAX is a bit mask comprised of the various bits
 260    defined in regex.h.  We return the old syntax.  */
 261
 262 reg_syntax_t
 263 re_set_syntax (syntax)
 264     reg_syntax_t syntax;
 265 {
 266   reg_syntax_t ret = re_syntax_options;
 267
 268   re_syntax_options = syntax;
 269   return ret;
 270 }
 271 #ifdef _LIBC
 272 weak_alias (__re_set_syntax, re_set_syntax)
 273 #endif
 274
 275 int
 276 re_compile_fastmap (bufp)
 277     struct re_pattern_buffer *bufp;
 278 {
 279   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 280   char *fastmap = bufp->fastmap;
 281
 282   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 283   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 284   if (dfa->init_state != dfa->init_state_word)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 286   if (dfa->init_state != dfa->init_state_nl)
 287     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 288   if (dfa->init_state != dfa->init_state_begbuf)
 289     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 290   bufp->fastmap_accurate = 1;
 291   return 0;
 292 }
 293 #ifdef _LIBC
 294 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 295 #endif
 296
 297 static inline void
 298 __attribute ((always_inline))
 299 re_set_fastmap (char *fastmap, bool icase, int ch)
 300 {
 301   fastmap[ch] = 1;
 302   if (icase)
 303     fastmap[tolower (ch)] = 1;
 304 }
 305
 306 /* Helper function for re_compile_fastmap.
 307    Compile fastmap for the initial_state INIT_STATE.  */
 308
 309 static void
 310 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 311                          char *fastmap)
 312 {
 313   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 314   Idx node_cnt;
 315   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 316   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 317     {
 318       Idx node = init_state->nodes.elems[node_cnt];
 319       re_token_type_t type = dfa->nodes[node].type;
 320
 321       if (type == CHARACTER)
 322         {
 323           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 324 #ifdef RE_ENABLE_I18N
 325           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 326             {
 327               unsigned char buf[MB_LEN_MAX];
 328               unsigned char *p;
 329               wchar_t wc;
 330               mbstate_t state;
 331
 332               p = buf;
 333               *p++ = dfa->nodes[node].opr.c;
 334               while (++node < dfa->nodes_len
 335                      && dfa->nodes[node].type == CHARACTER
 336                      && dfa->nodes[node].mb_partial)
 337                 *p++ = dfa->nodes[node].opr.c;
 338               memset (&state, '\0', sizeof (state));
 339               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 340                              &state) == p - buf
 341                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 342                       != (size_t) -1))
 343                 re_set_fastmap (fastmap, false, buf[0]);
 344             }
 345 #endif
 346         }
 347       else if (type == SIMPLE_BRACKET)
 348         {
 349           int i, ch;
 350           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 351             {
 352               int j;
 353               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 354               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 355                 if (w & ((bitset_word_t) 1 << j))
 356                   re_set_fastmap (fastmap, icase, ch);
 357             }
 358         }
 359 #ifdef RE_ENABLE_I18N
 360       else if (type == COMPLEX_BRACKET)
 361         {
 362           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 363           Idx i;
 364
 365 # ifdef _LIBC
 366           /* See if we have to try all bytes which start multiple collation
 367              elements.
 368              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 369                   collation element, and don't catch 'b' since 'b' is
 370                   the only collation element which starts from 'b' (and
 371                   it is caught by SIMPLE_BRACKET).  */
 372               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 373                   && (cset->ncoll_syms || cset->nranges))
 374                 {
 375                   const int32_t *table = (const int32_t *)
 376                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 377                   for (i = 0; i < SBC_MAX; ++i)
 378                     if (table[i] < 0)
 379                       re_set_fastmap (fastmap, icase, i);
 380                 }
 381 # endif /* _LIBC */
 382
 383           /* See if we have to start the match at all multibyte characters,
 384              i.e. where we would not find an invalid sequence.  This only
 385              applies to multibyte character sets; for single byte character
 386              sets, the SIMPLE_BRACKET again suffices.  */
 387           if (dfa->mb_cur_max > 1
 388               && (cset->nchar_classes || cset->non_match || cset->nranges
 389 # ifdef _LIBC
 390                   || cset->nequiv_classes
 391 # endif /* _LIBC */
 392                  ))
 393             {
 394               unsigned char c = 0;
 395               do
 396                 {
 397                   mbstate_t mbs;
 398                   memset (&mbs, 0, sizeof (mbs));
 399                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 400                     re_set_fastmap (fastmap, false, (int) c);
 401                 }
 402               while (++c != 0);
 403             }
 404
 405           else
 406             {
 407               /* ... Else catch all bytes which can start the mbchars.  */
 408               for (i = 0; i < cset->nmbchars; ++i)
 409                 {
 410                   char buf[256];
 411                   mbstate_t state;
 412                   memset (&state, '\0', sizeof (state));
 413                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 414                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 415                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 416                     {
 417                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 418                           != (size_t) -1)
 419                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 420                     }
 421                 }
 422             }
 423         }
 424 #endif /* RE_ENABLE_I18N */
 425       else if (type == OP_PERIOD
 426 #ifdef RE_ENABLE_I18N
 427                || type == OP_UTF8_PERIOD
 428 #endif /* RE_ENABLE_I18N */
 429                || type == END_OF_RE)
 430         {
 431           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 432           if (type == END_OF_RE)
 433             bufp->can_be_null = 1;
 434           return;
 435         }
 436     }
 437 }
 438 \f
 439 /* Entry point for POSIX code.  */
 440 /* regcomp takes a regular expression as a string and compiles it.
 441
 442    PREG is a regex_t *.  We do not expect any fields to be initialized,
 443    since POSIX says we shouldn't.  Thus, we set
 444
 445      `buffer' to the compiled pattern;
 446      `used' to the length of the compiled pattern;
 447      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 448        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 449        RE_SYNTAX_POSIX_BASIC;
 450      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 451      `fastmap' to an allocated space for the fastmap;
 452      `fastmap_accurate' to zero;
 453      `re_nsub' to the number of subexpressions in PATTERN.
 454
 455    PATTERN is the address of the pattern string.
 456
 457    CFLAGS is a series of bits which affect compilation.
 458
 459      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 460      use POSIX basic syntax.
 461
 462      If REG_NEWLINE is set, then . and [^...] don't match newline.
 463      Also, regexec will try a match beginning after every newline.
 464
 465      If REG_ICASE is set, then we considers upper- and lowercase
 466      versions of letters to be equivalent when matching.
 467
 468      If REG_NOSUB is set, then when PREG is passed to regexec, that
 469      routine will report only success or failure, and nothing about the
 470      registers.
 471
 472    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 473    the return codes and their meanings.)  */
 474
 475 int
 476 regcomp (preg, pattern, cflags)
 477     regex_t *_Restrict_ preg;
 478     const char *_Restrict_ pattern;
 479     int cflags;
 480 {
 481   reg_errcode_t ret;
 482   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 483                          : RE_SYNTAX_POSIX_BASIC);
 484
 485   preg->buffer = NULL;
 486   preg->allocated = 0;
 487   preg->used = 0;
 488
 489   /* Try to allocate space for the fastmap.  */
 490   preg->fastmap = re_malloc (char, SBC_MAX);
 491   if (BE (preg->fastmap == NULL, 0))
 492     return REG_ESPACE;
 493
 494   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 495
 496   /* If REG_NEWLINE is set, newlines are treated differently.  */
 497   if (cflags & REG_NEWLINE)
 498     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 499       syntax &= ~RE_DOT_NEWLINE;
 500       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 501       /* It also changes the matching behavior.  */
 502       preg->newline_anchor = 1;
 503     }
 504   else
 505     preg->newline_anchor = 0;
 506   preg->no_sub = !!(cflags & REG_NOSUB);
 507   preg->translate = NULL;
 508
 509   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 510
 511   /* POSIX doesn't distinguish between an unmatched open-group and an
 512      unmatched close-group: both are REG_EPAREN.  */
 513   if (ret == REG_ERPAREN)
 514     ret = REG_EPAREN;
 515
 516   /* We have already checked preg->fastmap != NULL.  */
 517   if (BE (ret == REG_NOERROR, 1))
 518     /* Compute the fastmap now, since regexec cannot modify the pattern
 519        buffer.  This function never fails in this implementation.  */
 520     (void) re_compile_fastmap (preg);
 521   else
 522     {
 523       /* Some error occurred while compiling the expression.  */
 524       re_free (preg->fastmap);
 525       preg->fastmap = NULL;
 526     }
 527
 528   return (int) ret;
 529 }
 530 #ifdef _LIBC
 531 weak_alias (__regcomp, regcomp)
 532 #endif
 533
 534 /* Returns a message corresponding to an error code, ERRCODE, returned
 535    from either regcomp or regexec.   We don't use PREG here.  */
 536
 537 #ifdef _LIBC
 538 size_t
 539 regerror (errcode, preg, errbuf, errbuf_size)
 540     int errcode;
 541     const regex_t *_Restrict_ preg;
 542     char *_Restrict_ errbuf;
 543     size_t errbuf_size;
 544 #else /* size_t might promote */
 545 size_t
 546 regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_,
 547           char *_Restrict_ errbuf, size_t errbuf_size)
 548 #endif
 549 {
 550   const char *msg;
 551   size_t msg_size;
 552
 553   if (BE (errcode < 0
 554           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 555                                / sizeof (__re_error_msgid_idx[0])), 0))
 556     /* Only error codes returned by the rest of the code should be passed
 557        to this routine.  If we are given anything else, or if other regex
 558        code generates an invalid error code, then the program has a bug.
 559        Dump core so we can fix it.  */
 560     abort ();
 561
 562   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 563
 564   msg_size = strlen (msg) + 1; /* Includes the null.  */
 565
 566   if (BE (errbuf_size != 0, 1))
 567     {
 568       size_t cpy_size = msg_size;
 569       if (BE (msg_size > errbuf_size, 0))
 570         {
 571           cpy_size = errbuf_size - 1;
 572           errbuf[cpy_size] = '\0';
 573         }
 574       memcpy (errbuf, msg, cpy_size);
 575     }
 576
 577   return msg_size;
 578 }
 579 #ifdef _LIBC
 580 weak_alias (__regerror, regerror)
 581 #endif
 582
 583
 584 #ifdef RE_ENABLE_I18N
 585 /* This static array is used for the map to single-byte characters when
 586    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 587    it the same all the time.  UTF-8 is the preferred encoding so this is
 588    a worthwhile optimization.  */
 589 static const bitset_t utf8_sb_map =
 590 {
 591   /* Set the first 128 bits.  */
 592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 593 #  error "bitset_word_t is narrower than 32 bits"
 594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 595   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX, BITSET_WORD_MAX,
 598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 599   BITSET_WORD_MAX,
 600 # endif
 601   (BITSET_WORD_MAX
 602    >> (SBC_MAX % BITSET_WORD_BITS == 0
 603        ? 0
 604        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 605 };
 606 #endif
 607
 608
 609 static void
 610 free_dfa_content (re_dfa_t *dfa)
 611 {
 612   Idx i, j;
 613
 614   if (dfa->nodes)
 615     for (i = 0; i < dfa->nodes_len; ++i)
 616       free_token (dfa->nodes + i);
 617   re_free (dfa->nexts);
 618   for (i = 0; i < dfa->nodes_len; ++i)
 619     {
 620       if (dfa->eclosures != NULL)
 621         re_node_set_free (dfa->eclosures + i);
 622       if (dfa->inveclosures != NULL)
 623         re_node_set_free (dfa->inveclosures + i);
 624       if (dfa->edests != NULL)
 625         re_node_set_free (dfa->edests + i);
 626     }
 627   re_free (dfa->edests);
 628   re_free (dfa->eclosures);
 629   re_free (dfa->inveclosures);
 630   re_free (dfa->nodes);
 631
 632   if (dfa->state_table)
 633     for (i = 0; i <= dfa->state_hash_mask; ++i)
 634       {
 635         struct re_state_table_entry *entry = dfa->state_table + i;
 636         for (j = 0; j < entry->num; ++j)
 637           {
 638             re_dfastate_t *state = entry->array[j];
 639             free_state (state);
 640           }
 641         re_free (entry->array);
 642       }
 643   re_free (dfa->state_table);
 644 #ifdef RE_ENABLE_I18N
 645   if (dfa->sb_char != utf8_sb_map)
 646     re_free (dfa->sb_char);
 647 #endif
 648   re_free (dfa->subexp_map);
 649 #ifdef DEBUG
 650   re_free (dfa->re_str);
 651 #endif
 652
 653   re_free (dfa);
 654 }
 655
 656
 657 /* Free dynamically allocated space used by PREG.  */
 658
 659 void
 660 regfree (preg)
 661     regex_t *preg;
 662 {
 663   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 664   if (BE (dfa != NULL, 1))
 665     free_dfa_content (dfa);
 666   preg->buffer = NULL;
 667   preg->allocated = 0;
 668
 669   re_free (preg->fastmap);
 670   preg->fastmap = NULL;
 671
 672   re_free (preg->translate);
 673   preg->translate = NULL;
 674 }
 675 #ifdef _LIBC
 676 weak_alias (__regfree, regfree)
 677 #endif
 678 \f
 679 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 680    them unless specifically requested.  */
 681
 682 #if defined _REGEX_RE_COMP || defined _LIBC
 683
 684 /* BSD has one and only one pattern buffer.  */
 685 static struct re_pattern_buffer re_comp_buf;
 686
 687 char *
 688 # ifdef _LIBC
 689 /* Make these definitions weak in libc, so POSIX programs can redefine
 690    these names if they don't use our functions, and still use
 691    regcomp/regexec above without link errors.  */
 692 weak_function
 693 # endif
 694 re_comp (s)
 695      const char *s;
 696 {
 697   reg_errcode_t ret;
 698   char *fastmap;
 699
 700   if (!s)
 701     {
 702       if (!re_comp_buf.buffer)
 703         return gettext ("No previous regular expression");
 704       return 0;
 705     }
 706
 707   if (re_comp_buf.buffer)
 708     {
 709       fastmap = re_comp_buf.fastmap;
 710       re_comp_buf.fastmap = NULL;
 711       __regfree (&re_comp_buf);
 712       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 713       re_comp_buf.fastmap = fastmap;
 714     }
 715
 716   if (re_comp_buf.fastmap == NULL)
 717     {
 718       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 719       if (re_comp_buf.fastmap == NULL)
 720         return (char *) gettext (__re_error_msgid
 721                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 722     }
 723
 724   /* Since `re_exec' always passes NULL for the `regs' argument, we
 725      don't need to initialize the pattern buffer fields which affect it.  */
 726
 727   /* Match anchors at newlines.  */
 728   re_comp_buf.newline_anchor = 1;
 729
 730   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 731
 732   if (!ret)
 733     return NULL;
 734
 735   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 736   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 737 }
 738
 739 #ifdef _LIBC
 740 libc_freeres_fn (free_mem)
 741 {
 742   __regfree (&re_comp_buf);
 743 }
 744 #endif
 745
 746 #endif /* _REGEX_RE_COMP */
 747 \f
 748 /* Internal entry point.
 749    Compile the regular expression PATTERN, whose length is LENGTH.
 750    SYNTAX indicate regular expression's syntax.  */
 751
 752 static reg_errcode_t
 753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 754                      reg_syntax_t syntax)
 755 {
 756   reg_errcode_t err = REG_NOERROR;
 757   re_dfa_t *dfa;
 758   re_string_t regexp;
 759
 760   /* Initialize the pattern buffer.  */
 761   preg->fastmap_accurate = 0;
 762   preg->syntax = syntax;
 763   preg->not_bol = preg->not_eol = 0;
 764   preg->used = 0;
 765   preg->re_nsub = 0;
 766   preg->can_be_null = 0;
 767   preg->regs_allocated = REGS_UNALLOCATED;
 768
 769   /* Initialize the dfa.  */
 770   dfa = (re_dfa_t *) preg->buffer;
 771   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 772     {
 773       /* If zero allocated, but buffer is non-null, try to realloc
 774          enough space.  This loses if buffer's address is bogus, but
 775          that is the user's responsibility.  If ->buffer is NULL this
 776          is a simple allocation.  */
 777       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 778       if (dfa == NULL)
 779         return REG_ESPACE;
 780       preg->allocated = sizeof (re_dfa_t);
 781       preg->buffer = (unsigned char *) dfa;
 782     }
 783   preg->used = sizeof (re_dfa_t);
 784
 785   err = init_dfa (dfa, length);
 786   if (BE (err != REG_NOERROR, 0))
 787     {
 788       free_dfa_content (dfa);
 789       preg->buffer = NULL;
 790       preg->allocated = 0;
 791       return err;
 792     }
 793 #ifdef DEBUG
 794   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 795   dfa->re_str = re_malloc (char, length + 1);
 796   strncpy (dfa->re_str, pattern, length + 1);
 797 #endif
 798
 799   __libc_lock_init (dfa->lock);
 800
 801   err = re_string_construct (&regexp, pattern, length, preg->translate,
 802                              (syntax & RE_ICASE) != 0, dfa);
 803   if (BE (err != REG_NOERROR, 0))
 804     {
 805     re_compile_internal_free_return:
 806       free_workarea_compile (preg);
 807       re_string_destruct (&regexp);
 808       free_dfa_content (dfa);
 809       preg->buffer = NULL;
 810       preg->allocated = 0;
 811       return err;
 812     }
 813
 814   /* Parse the regular expression, and build a structure tree.  */
 815   preg->re_nsub = 0;
 816   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 817   if (BE (dfa->str_tree == NULL, 0))
 818     goto re_compile_internal_free_return;
 819
 820   /* Analyze the tree and create the nfa.  */
 821   err = analyze (preg);
 822   if (BE (err != REG_NOERROR, 0))
 823     goto re_compile_internal_free_return;
 824
 825 #ifdef RE_ENABLE_I18N
 826   /* If possible, do searching in single byte encoding to speed things up.  */
 827   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 828     optimize_utf8 (dfa);
 829 #endif
 830
 831   /* Then create the initial state of the dfa.  */
 832   err = create_initial_state (dfa);
 833
 834   /* Release work areas.  */
 835   free_workarea_compile (preg);
 836   re_string_destruct (&regexp);
 837
 838   if (BE (err != REG_NOERROR, 0))
 839     {
 840       free_dfa_content (dfa);
 841       preg->buffer = NULL;
 842       preg->allocated = 0;
 843     }
 844
 845   return err;
 846 }
 847
 848 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 849    as the initial length of some arrays.  */
 850
 851 static reg_errcode_t
 852 init_dfa (re_dfa_t *dfa, size_t pat_len)
 853 {
 854   __re_size_t table_size;
 855 #ifndef _LIBC
 856   char *codeset_name;
 857 #endif
 858 #ifdef RE_ENABLE_I18N
 859   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 860 #else
 861   size_t max_i18n_object_size = 0;
 862 #endif
 863   size_t max_object_size =
 864     MAX (sizeof (struct re_state_table_entry),
 865          MAX (sizeof (re_token_t),
 866               MAX (sizeof (re_node_set),
 867                    MAX (sizeof (regmatch_t),
 868                         max_i18n_object_size))));
 869
 870   memset (dfa, '\0', sizeof (re_dfa_t));
 871
 872   /* Force allocation of str_tree_storage the first time.  */
 873   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 874
 875   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 876      calculation below, and for similar doubling calculations
 877      elsewhere.  And it's <= rather than <, because some of the
 878      doubling calculations add 1 afterwards.  */
 879   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 880     return REG_ESPACE;
 881
 882   dfa->nodes_alloc = pat_len + 1;
 883   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 884
 885   /*  table_size = 2 ^ ceil(log pat_len) */
 886   for (table_size = 1; ; table_size <<= 1)
 887     if (table_size > pat_len)
 888       break;
 889
 890   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 891   dfa->state_hash_mask = table_size - 1;
 892
 893   dfa->mb_cur_max = MB_CUR_MAX;
 894 #ifdef _LIBC
 895   if (dfa->mb_cur_max == 6
 896       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 897     dfa->is_utf8 = 1;
 898   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 899                        != 0);
 900 #else
 901   codeset_name = nl_langinfo (CODESET);
 902   if (strcasecmp (codeset_name, "UTF-8") == 0
 903       || strcasecmp (codeset_name, "UTF8") == 0)
 904     dfa->is_utf8 = 1;
 905
 906   /* We check exhaustively in the loop below if this charset is a
 907      superset of ASCII.  */
 908   dfa->map_notascii = 0;
 909 #endif
 910
 911 #ifdef RE_ENABLE_I18N
 912   if (dfa->mb_cur_max > 1)
 913     {
 914       if (dfa->is_utf8)
 915         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 916       else
 917         {
 918           int i, j, ch;
 919
 920           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 921           if (BE (dfa->sb_char == NULL, 0))
 922             return REG_ESPACE;
 923
 924           /* Set the bits corresponding to single byte chars.  */
 925           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 926             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 927               {
 928                 wint_t wch = __btowc (ch);
 929                 if (wch != WEOF)
 930                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 931 # ifndef _LIBC
 932                 if (isascii (ch) && wch != ch)
 933                   dfa->map_notascii = 1;
 934 # endif
 935               }
 936         }
 937     }
 938 #endif
 939
 940   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 941     return REG_ESPACE;
 942   return REG_NOERROR;
 943 }
 944
 945 /* Initialize WORD_CHAR table, which indicate which character is
 946    "word".  In this case "word" means that it is the word construction
 947    character used by some operators like "\<", "\>", etc.  */
 948
 949 static void
 950 internal_function
 951 init_word_char (re_dfa_t *dfa)
 952 {
 953   int i, j, ch;
 954   dfa->word_ops_used = 1;
 955   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 956     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 957       if (isalnum (ch) || ch == '_')
 958         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 959 }
 960
 961 /* Free the work area which are only used while compiling.  */
 962
 963 static void
 964 free_workarea_compile (regex_t *preg)
 965 {
 966   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 967   bin_tree_storage_t *storage, *next;
 968   for (storage = dfa->str_tree_storage; storage; storage = next)
 969     {
 970       next = storage->next;
 971       re_free (storage);
 972     }
 973   dfa->str_tree_storage = NULL;
 974   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 975   dfa->str_tree = NULL;
 976   re_free (dfa->org_indices);
 977   dfa->org_indices = NULL;
 978 }
 979
 980 /* Create initial states for all contexts.  */
 981
 982 static reg_errcode_t
 983 create_initial_state (re_dfa_t *dfa)
 984 {
 985   Idx first, i;
 986   reg_errcode_t err;
 987   re_node_set init_nodes;
 988
 989   /* Initial states have the epsilon closure of the node which is
 990      the first node of the regular expression.  */
 991   first = dfa->str_tree->first->node_idx;
 992   dfa->init_node = first;
 993   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 994   if (BE (err != REG_NOERROR, 0))
 995     return err;
 996
 997   /* The back-references which are in initial states can epsilon transit,
 998      since in this case all of the subexpressions can be null.
 999      Then we add epsilon closures of the nodes which are the next nodes of
1000      the back-references.  */
1001   if (dfa->nbackref > 0)
1002     for (i = 0; i < init_nodes.nelem; ++i)
1003       {
1004         Idx node_idx = init_nodes.elems[i];
1005         re_token_type_t type = dfa->nodes[node_idx].type;
1006
1007         Idx clexp_idx;
1008         if (type != OP_BACK_REF)
1009           continue;
1010         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1011           {
1012             re_token_t *clexp_node;
1013             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1014             if (clexp_node->type == OP_CLOSE_SUBEXP
1015                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016               break;
1017           }
1018         if (clexp_idx == init_nodes.nelem)
1019           continue;
1020
1021         if (type == OP_BACK_REF)
1022           {
1023             Idx dest_idx = dfa->edests[node_idx].elems[0];
1024             if (!re_node_set_contains (&init_nodes, dest_idx))
1025               {
1026                 reg_errcode_t merge_err
1027                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1028                 if (merge_err != REG_NOERROR)
1029                   return merge_err;
1030                 i = 0;
1031               }
1032           }
1033       }
1034
1035   /* It must be the first time to invoke acquire_state.  */
1036   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1037   /* We don't check ERR here, since the initial state must not be NULL.  */
1038   if (BE (dfa->init_state == NULL, 0))
1039     return err;
1040   if (dfa->init_state->has_constraint)
1041     {
1042       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1043                                                        CONTEXT_WORD);
1044       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1045                                                      CONTEXT_NEWLINE);
1046       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1047                                                          &init_nodes,
1048                                                          CONTEXT_NEWLINE
1049                                                          | CONTEXT_BEGBUF);
1050       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1051               || dfa->init_state_begbuf == NULL, 0))
1052         return err;
1053     }
1054   else
1055     dfa->init_state_word = dfa->init_state_nl
1056       = dfa->init_state_begbuf = dfa->init_state;
1057
1058   re_node_set_free (&init_nodes);
1059   return REG_NOERROR;
1060 }
1061 \f
1062 #ifdef RE_ENABLE_I18N
1063 /* If it is possible to do searching in single byte encoding instead of UTF-8
1064    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1065    DFA nodes where needed.  */
1066
1067 static void
1068 optimize_utf8 (re_dfa_t *dfa)
1069 {
1070   Idx node;
1071   int i;
1072   bool mb_chars = false;
1073   bool has_period = false;
1074
1075   for (node = 0; node < dfa->nodes_len; ++node)
1076     switch (dfa->nodes[node].type)
1077       {
1078       case CHARACTER:
1079         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1080           mb_chars = true;
1081         break;
1082       case ANCHOR:
1083         switch (dfa->nodes[node].opr.ctx_type)
1084           {
1085           case LINE_FIRST:
1086           case LINE_LAST:
1087           case BUF_FIRST:
1088           case BUF_LAST:
1089             break;
1090           default:
1091             /* Word anchors etc. cannot be handled.  It's okay to test
1092                opr.ctx_type since constraints (for all DFA nodes) are
1093                created by ORing one or more opr.ctx_type values.  */
1094             return;
1095           }
1096         break;
1097       case OP_PERIOD:
1098         has_period = true;
1099         break;
1100       case OP_BACK_REF:
1101       case OP_ALT:
1102       case END_OF_RE:
1103       case OP_DUP_ASTERISK:
1104       case OP_OPEN_SUBEXP:
1105       case OP_CLOSE_SUBEXP:
1106         break;
1107       case COMPLEX_BRACKET:
1108         return;
1109       case SIMPLE_BRACKET:
1110         /* Just double check.  */
1111         {
1112           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1113                         ? 0
1114                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1115           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1116             {
1117               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1118                 return;
1119               rshift = 0;
1120             }
1121         }
1122         break;
1123       default:
1124         abort ();
1125       }
1126
1127   if (mb_chars || has_period)
1128     for (node = 0; node < dfa->nodes_len; ++node)
1129       {
1130         if (dfa->nodes[node].type == CHARACTER
1131             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1132           dfa->nodes[node].mb_partial = 0;
1133         else if (dfa->nodes[node].type == OP_PERIOD)
1134           dfa->nodes[node].type = OP_UTF8_PERIOD;
1135       }
1136
1137   /* The search can be in single byte locale.  */
1138   dfa->mb_cur_max = 1;
1139   dfa->is_utf8 = 0;
1140   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1141 }
1142 #endif
1143 \f
1144 /* Analyze the structure tree, and calculate "first", "next", "edest",
1145    "eclosure", and "inveclosure".  */
1146
1147 static reg_errcode_t
1148 analyze (regex_t *preg)
1149 {
1150   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1151   reg_errcode_t ret;
1152
1153   /* Allocate arrays.  */
1154   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1155   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1156   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1157   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1158   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1159           || dfa->eclosures == NULL, 0))
1160     return REG_ESPACE;
1161
1162   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1163   if (dfa->subexp_map != NULL)
1164     {
1165       Idx i;
1166       for (i = 0; i < preg->re_nsub; i++)
1167         dfa->subexp_map[i] = i;
1168       preorder (dfa->str_tree, optimize_subexps, dfa);
1169       for (i = 0; i < preg->re_nsub; i++)
1170         if (dfa->subexp_map[i] != i)
1171           break;
1172       if (i == preg->re_nsub)
1173         {
1174           free (dfa->subexp_map);
1175           dfa->subexp_map = NULL;
1176         }
1177     }
1178
1179   ret = postorder (dfa->str_tree, lower_subexps, preg);
1180   if (BE (ret != REG_NOERROR, 0))
1181     return ret;
1182   ret = postorder (dfa->str_tree, calc_first, dfa);
1183   if (BE (ret != REG_NOERROR, 0))
1184     return ret;
1185   preorder (dfa->str_tree, calc_next, dfa);
1186   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1187   if (BE (ret != REG_NOERROR, 0))
1188     return ret;
1189   ret = calc_eclosure (dfa);
1190   if (BE (ret != REG_NOERROR, 0))
1191     return ret;
1192
1193   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1194      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1195   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1196       || dfa->nbackref)
1197     {
1198       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1199       if (BE (dfa->inveclosures == NULL, 0))
1200         return REG_ESPACE;
1201       ret = calc_inveclosure (dfa);
1202     }
1203
1204   return ret;
1205 }
1206
1207 /* Our parse trees are very unbalanced, so we cannot use a stack to
1208    implement parse tree visits.  Instead, we use parent pointers and
1209    some hairy code in these two functions.  */
1210 static reg_errcode_t
1211 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1212            void *extra)
1213 {
1214   bin_tree_t *node, *prev;
1215
1216   for (node = root; ; )
1217     {
1218       /* Descend down the tree, preferably to the left (or to the right
1219          if that's the only child).  */
1220       while (node->left || node->right)
1221         if (node->left)
1222           node = node->left;
1223         else
1224           node = node->right;
1225
1226       do
1227         {
1228           reg_errcode_t err = fn (extra, node);
1229           if (BE (err != REG_NOERROR, 0))
1230             return err;
1231           if (node->parent == NULL)
1232             return REG_NOERROR;
1233           prev = node;
1234           node = node->parent;
1235         }
1236       /* Go up while we have a node that is reached from the right.  */
1237       while (node->right == prev || node->right == NULL);
1238       node = node->right;
1239     }
1240 }
1241
1242 static reg_errcode_t
1243 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1244           void *extra)
1245 {
1246   bin_tree_t *node;
1247
1248   for (node = root; ; )
1249     {
1250       reg_errcode_t err = fn (extra, node);
1251       if (BE (err != REG_NOERROR, 0))
1252         return err;
1253
1254       /* Go to the left node, or up and to the right.  */
1255       if (node->left)
1256         node = node->left;
1257       else
1258         {
1259           bin_tree_t *prev = NULL;
1260           while (node->right == prev || node->right == NULL)
1261             {
1262               prev = node;
1263               node = node->parent;
1264               if (!node)
1265                 return REG_NOERROR;
1266             }
1267           node = node->right;
1268         }
1269     }
1270 }
1271
1272 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1273    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1274    backreferences as well.  Requires a preorder visit.  */
1275 static reg_errcode_t
1276 optimize_subexps (void *extra, bin_tree_t *node)
1277 {
1278   re_dfa_t *dfa = (re_dfa_t *) extra;
1279
1280   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1281     {
1282       int idx = node->token.opr.idx;
1283       node->token.opr.idx = dfa->subexp_map[idx];
1284       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1285     }
1286
1287   else if (node->token.type == SUBEXP
1288            && node->left && node->left->token.type == SUBEXP)
1289     {
1290       Idx other_idx = node->left->token.opr.idx;
1291
1292       node->left = node->left->left;
1293       if (node->left)
1294         node->left->parent = node;
1295
1296       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1297       if (other_idx < BITSET_WORD_BITS)
1298         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1299     }
1300
1301   return REG_NOERROR;
1302 }
1303
1304 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1305    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1306 static reg_errcode_t
1307 lower_subexps (void *extra, bin_tree_t *node)
1308 {
1309   regex_t *preg = (regex_t *) extra;
1310   reg_errcode_t err = REG_NOERROR;
1311
1312   if (node->left && node->left->token.type == SUBEXP)
1313     {
1314       node->left = lower_subexp (&err, preg, node->left);
1315       if (node->left)
1316         node->left->parent = node;
1317     }
1318   if (node->right && node->right->token.type == SUBEXP)
1319     {
1320       node->right = lower_subexp (&err, preg, node->right);
1321       if (node->right)
1322         node->right->parent = node;
1323     }
1324
1325   return err;
1326 }
1327
1328 static bin_tree_t *
1329 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1330 {
1331   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1332   bin_tree_t *body = node->left;
1333   bin_tree_t *op, *cls, *tree1, *tree;
1334
1335   if (preg->no_sub
1336       /* We do not optimize empty subexpressions, because otherwise we may
1337          have bad CONCAT nodes with NULL children.  This is obviously not
1338          very common, so we do not lose much.  An example that triggers
1339          this case is the sed "script" /\(\)/x.  */
1340       && node->left != NULL
1341       && (node->token.opr.idx >= BITSET_WORD_BITS
1342           || !(dfa->used_bkref_map
1343                & ((bitset_word_t) 1 << node->token.opr.idx))))
1344     return node->left;
1345
1346   /* Convert the SUBEXP node to the concatenation of an
1347      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1348   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1349   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1350   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1351   tree = create_tree (dfa, op, tree1, CONCAT);
1352   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1353     {
1354       *err = REG_ESPACE;
1355       return NULL;
1356     }
1357
1358   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1359   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1360   return tree;
1361 }
1362
1363 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1364    nodes.  Requires a postorder visit.  */
1365 static reg_errcode_t
1366 calc_first (void *extra, bin_tree_t *node)
1367 {
1368   re_dfa_t *dfa = (re_dfa_t *) extra;
1369   if (node->token.type == CONCAT)
1370     {
1371       node->first = node->left->first;
1372       node->node_idx = node->left->node_idx;
1373     }
1374   else
1375     {
1376       node->first = node;
1377       node->node_idx = re_dfa_add_node (dfa, node->token);
1378       if (BE (node->node_idx == REG_MISSING, 0))
1379         return REG_ESPACE;
1380       if (node->token.type == ANCHOR)
1381         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1382     }
1383   return REG_NOERROR;
1384 }
1385
1386 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1387 static reg_errcode_t
1388 calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
1389 {
1390   switch (node->token.type)
1391     {
1392     case OP_DUP_ASTERISK:
1393       node->left->next = node;
1394       break;
1395     case CONCAT:
1396       node->left->next = node->right->first;
1397       node->right->next = node->next;
1398       break;
1399     default:
1400       if (node->left)
1401         node->left->next = node->next;
1402       if (node->right)
1403         node->right->next = node->next;
1404       break;
1405     }
1406   return REG_NOERROR;
1407 }
1408
1409 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1410 static reg_errcode_t
1411 link_nfa_nodes (void *extra, bin_tree_t *node)
1412 {
1413   re_dfa_t *dfa = (re_dfa_t *) extra;
1414   Idx idx = node->node_idx;
1415   reg_errcode_t err = REG_NOERROR;
1416
1417   switch (node->token.type)
1418     {
1419     case CONCAT:
1420       break;
1421
1422     case END_OF_RE:
1423       assert (node->next == NULL);
1424       break;
1425
1426     case OP_DUP_ASTERISK:
1427     case OP_ALT:
1428       {
1429         Idx left, right;
1430         dfa->has_plural_match = 1;
1431         if (node->left != NULL)
1432           left = node->left->first->node_idx;
1433         else
1434           left = node->next->node_idx;
1435         if (node->right != NULL)
1436           right = node->right->first->node_idx;
1437         else
1438           right = node->next->node_idx;
1439         assert (REG_VALID_INDEX (left));
1440         assert (REG_VALID_INDEX (right));
1441         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1442       }
1443       break;
1444
1445     case ANCHOR:
1446     case OP_OPEN_SUBEXP:
1447     case OP_CLOSE_SUBEXP:
1448       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1449       break;
1450
1451     case OP_BACK_REF:
1452       dfa->nexts[idx] = node->next->node_idx;
1453       if (node->token.type == OP_BACK_REF)
1454         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1455       break;
1456
1457     default:
1458       assert (!IS_EPSILON_NODE (node->token.type));
1459       dfa->nexts[idx] = node->next->node_idx;
1460       break;
1461     }
1462
1463   return err;
1464 }
1465
1466 /* Duplicate the epsilon closure of the node ROOT_NODE.
1467    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1468    to their own constraint.  */
1469
1470 static reg_errcode_t
1471 internal_function
1472 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1473                         Idx root_node, unsigned int init_constraint)
1474 {
1475   Idx org_node, clone_node;
1476   bool ok;
1477   unsigned int constraint = init_constraint;
1478   for (org_node = top_org_node, clone_node = top_clone_node;;)
1479     {
1480       Idx org_dest, clone_dest;
1481       if (dfa->nodes[org_node].type == OP_BACK_REF)
1482         {
1483           /* If the back reference epsilon-transit, its destination must
1484              also have the constraint.  Then duplicate the epsilon closure
1485              of the destination of the back reference, and store it in
1486              edests of the back reference.  */
1487           org_dest = dfa->nexts[org_node];
1488           re_node_set_empty (dfa->edests + clone_node);
1489           clone_dest = duplicate_node (dfa, org_dest, constraint);
1490           if (BE (clone_dest == REG_MISSING, 0))
1491             return REG_ESPACE;
1492           dfa->nexts[clone_node] = dfa->nexts[org_node];
1493           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1494           if (BE (! ok, 0))
1495             return REG_ESPACE;
1496         }
1497       else if (dfa->edests[org_node].nelem == 0)
1498         {
1499           /* In case of the node can't epsilon-transit, don't duplicate the
1500              destination and store the original destination as the
1501              destination of the node.  */
1502           dfa->nexts[clone_node] = dfa->nexts[org_node];
1503           break;
1504         }
1505       else if (dfa->edests[org_node].nelem == 1)
1506         {
1507           /* In case of the node can epsilon-transit, and it has only one
1508              destination.  */
1509           org_dest = dfa->edests[org_node].elems[0];
1510           re_node_set_empty (dfa->edests + clone_node);
1511           /* If the node is root_node itself, it means the epsilon closure
1512              has a loop.  Then tie it to the destination of the root_node.  */
1513           if (org_node == root_node && clone_node != org_node)
1514             {
1515               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1516               if (BE (! ok, 0))
1517                 return REG_ESPACE;
1518               break;
1519             }
1520           /* In case the node has another constraint, append it.  */
1521           constraint |= dfa->nodes[org_node].constraint;
1522           clone_dest = duplicate_node (dfa, org_dest, constraint);
1523           if (BE (clone_dest == REG_MISSING, 0))
1524             return REG_ESPACE;
1525           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526           if (BE (! ok, 0))
1527             return REG_ESPACE;
1528         }
1529       else /* dfa->edests[org_node].nelem == 2 */
1530         {
1531           /* In case of the node can epsilon-transit, and it has two
1532              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1533           org_dest = dfa->edests[org_node].elems[0];
1534           re_node_set_empty (dfa->edests + clone_node);
1535           /* Search for a duplicated node which satisfies the constraint.  */
1536           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1537           if (clone_dest == REG_MISSING)
1538             {
1539               /* There is no such duplicated node, create a new one.  */
1540               reg_errcode_t err;
1541               clone_dest = duplicate_node (dfa, org_dest, constraint);
1542               if (BE (clone_dest == REG_MISSING, 0))
1543                 return REG_ESPACE;
1544               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1545               if (BE (! ok, 0))
1546                 return REG_ESPACE;
1547               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1548                                             root_node, constraint);
1549               if (BE (err != REG_NOERROR, 0))
1550                 return err;
1551             }
1552           else
1553             {
1554               /* There is a duplicated node which satisfies the constraint,
1555                  use it to avoid infinite loop.  */
1556               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557               if (BE (! ok, 0))
1558                 return REG_ESPACE;
1559             }
1560
1561           org_dest = dfa->edests[org_node].elems[1];
1562           clone_dest = duplicate_node (dfa, org_dest, constraint);
1563           if (BE (clone_dest == REG_MISSING, 0))
1564             return REG_ESPACE;
1565           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566           if (BE (! ok, 0))
1567             return REG_ESPACE;
1568         }
1569       org_node = org_dest;
1570       clone_node = clone_dest;
1571     }
1572   return REG_NOERROR;
1573 }
1574
1575 /* Search for a node which is duplicated from the node ORG_NODE, and
1576    satisfies the constraint CONSTRAINT.  */
1577
1578 static Idx
1579 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1580                         unsigned int constraint)
1581 {
1582   Idx idx;
1583   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1584     {
1585       if (org_node == dfa->org_indices[idx]
1586           && constraint == dfa->nodes[idx].constraint)
1587         return idx; /* Found.  */
1588     }
1589   return REG_MISSING; /* Not found.  */
1590 }
1591
1592 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1593    Return the index of the new node, or REG_MISSING if insufficient storage is
1594    available.  */
1595
1596 static Idx
1597 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1598 {
1599   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1600   if (BE (dup_idx != REG_MISSING, 1))
1601     {
1602       dfa->nodes[dup_idx].constraint = constraint;
1603       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1604       dfa->nodes[dup_idx].duplicated = 1;
1605
1606       /* Store the index of the original node.  */
1607       dfa->org_indices[dup_idx] = org_idx;
1608     }
1609   return dup_idx;
1610 }
1611
1612 static reg_errcode_t
1613 calc_inveclosure (re_dfa_t *dfa)
1614 {
1615   Idx src, idx;
1616   bool ok;
1617   for (idx = 0; idx < dfa->nodes_len; ++idx)
1618     re_node_set_init_empty (dfa->inveclosures + idx);
1619
1620   for (src = 0; src < dfa->nodes_len; ++src)
1621     {
1622       Idx *elems = dfa->eclosures[src].elems;
1623       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1624         {
1625           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1626           if (BE (! ok, 0))
1627             return REG_ESPACE;
1628         }
1629     }
1630
1631   return REG_NOERROR;
1632 }
1633
1634 /* Calculate "eclosure" for all the node in DFA.  */
1635
1636 static reg_errcode_t
1637 calc_eclosure (re_dfa_t *dfa)
1638 {
1639   Idx node_idx;
1640   bool incomplete;
1641 #ifdef DEBUG
1642   assert (dfa->nodes_len > 0);
1643 #endif
1644   incomplete = false;
1645   /* For each nodes, calculate epsilon closure.  */
1646   for (node_idx = 0; ; ++node_idx)
1647     {
1648       reg_errcode_t err;
1649       re_node_set eclosure_elem;
1650       if (node_idx == dfa->nodes_len)
1651         {
1652           if (!incomplete)
1653             break;
1654           incomplete = false;
1655           node_idx = 0;
1656         }
1657
1658 #ifdef DEBUG
1659       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1660 #endif
1661
1662       /* If we have already calculated, skip it.  */
1663       if (dfa->eclosures[node_idx].nelem != 0)
1664         continue;
1665       /* Calculate epsilon closure of `node_idx'.  */
1666       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1667       if (BE (err != REG_NOERROR, 0))
1668         return err;
1669
1670       if (dfa->eclosures[node_idx].nelem == 0)
1671         {
1672           incomplete = true;
1673           re_node_set_free (&eclosure_elem);
1674         }
1675     }
1676   return REG_NOERROR;
1677 }
1678
1679 /* Calculate epsilon closure of NODE.  */
1680
1681 static reg_errcode_t
1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1683 {
1684   reg_errcode_t err;
1685   Idx i;
1686   re_node_set eclosure;
1687   bool ok;
1688   bool incomplete = false;
1689   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690   if (BE (err != REG_NOERROR, 0))
1691     return err;
1692
1693   /* This indicates that we are calculating this node now.
1694      We reference this value to avoid infinite loop.  */
1695   dfa->eclosures[node].nelem = REG_MISSING;
1696
1697   /* If the current node has constraints, duplicate all nodes
1698      since they must inherit the constraints.  */
1699   if (dfa->nodes[node].constraint
1700       && dfa->edests[node].nelem
1701       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1702     {
1703       err = duplicate_node_closure (dfa, node, node, node,
1704                                     dfa->nodes[node].constraint);
1705       if (BE (err != REG_NOERROR, 0))
1706         return err;
1707     }
1708
1709   /* Expand each epsilon destination nodes.  */
1710   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711     for (i = 0; i < dfa->edests[node].nelem; ++i)
1712       {
1713         re_node_set eclosure_elem;
1714         Idx edest = dfa->edests[node].elems[i];
1715         /* If calculating the epsilon closure of `edest' is in progress,
1716            return intermediate result.  */
1717         if (dfa->eclosures[edest].nelem == REG_MISSING)
1718           {
1719             incomplete = true;
1720             continue;
1721           }
1722         /* If we haven't calculated the epsilon closure of `edest' yet,
1723            calculate now. Otherwise use calculated epsilon closure.  */
1724         if (dfa->eclosures[edest].nelem == 0)
1725           {
1726             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1727             if (BE (err != REG_NOERROR, 0))
1728               return err;
1729           }
1730         else
1731           eclosure_elem = dfa->eclosures[edest];
1732         /* Merge the epsilon closure of `edest'.  */
1733         err = re_node_set_merge (&eclosure, &eclosure_elem);
1734         if (BE (err != REG_NOERROR, 0))
1735           return err;
1736         /* If the epsilon closure of `edest' is incomplete,
1737            the epsilon closure of this node is also incomplete.  */
1738         if (dfa->eclosures[edest].nelem == 0)
1739           {
1740             incomplete = true;
1741             re_node_set_free (&eclosure_elem);
1742           }
1743       }
1744
1745   /* An epsilon closure includes itself.  */
1746   ok = re_node_set_insert (&eclosure, node);
1747   if (BE (! ok, 0))
1748     return REG_ESPACE;
1749   if (incomplete && !root)
1750     dfa->eclosures[node].nelem = 0;
1751   else
1752     dfa->eclosures[node] = eclosure;
1753   *new_set = eclosure;
1754   return REG_NOERROR;
1755 }
1756 \f
1757 /* Functions for token which are used in the parser.  */
1758
1759 /* Fetch a token from INPUT.
1760    We must not use this function inside bracket expressions.  */
1761
1762 static void
1763 internal_function
1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1765 {
1766   re_string_skip_bytes (input, peek_token (result, input, syntax));
1767 }
1768
1769 /* Peek a token from INPUT, and return the length of the token.
1770    We must not use this function inside bracket expressions.  */
1771
1772 static int
1773 internal_function
1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1775 {
1776   unsigned char c;
1777
1778   if (re_string_eoi (input))
1779     {
1780       token->type = END_OF_RE;
1781       return 0;
1782     }
1783
1784   c = re_string_peek_byte (input, 0);
1785   token->opr.c = c;
1786
1787   token->word_char = 0;
1788 #ifdef RE_ENABLE_I18N
1789   token->mb_partial = 0;
1790   if (input->mb_cur_max > 1 &&
1791       !re_string_first_byte (input, re_string_cur_idx (input)))
1792     {
1793       token->type = CHARACTER;
1794       token->mb_partial = 1;
1795       return 1;
1796     }
1797 #endif
1798   if (c == '\\')
1799     {
1800       unsigned char c2;
1801       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1802         {
1803           token->type = BACK_SLASH;
1804           return 1;
1805         }
1806
1807       c2 = re_string_peek_byte_case (input, 1);
1808       token->opr.c = c2;
1809       token->type = CHARACTER;
1810 #ifdef RE_ENABLE_I18N
1811       if (input->mb_cur_max > 1)
1812         {
1813           wint_t wc = re_string_wchar_at (input,
1814                                           re_string_cur_idx (input) + 1);
1815           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1816         }
1817       else
1818 #endif
1819         token->word_char = IS_WORD_CHAR (c2) != 0;
1820
1821       switch (c2)
1822         {
1823         case '|':
1824           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825             token->type = OP_ALT;
1826           break;
1827         case '1': case '2': case '3': case '4': case '5':
1828         case '6': case '7': case '8': case '9':
1829           if (!(syntax & RE_NO_BK_REFS))
1830             {
1831               token->type = OP_BACK_REF;
1832               token->opr.idx = c2 - '1';
1833             }
1834           break;
1835         case '<':
1836           if (!(syntax & RE_NO_GNU_OPS))
1837             {
1838               token->type = ANCHOR;
1839               token->opr.ctx_type = WORD_FIRST;
1840             }
1841           break;
1842         case '>':
1843           if (!(syntax & RE_NO_GNU_OPS))
1844             {
1845               token->type = ANCHOR;
1846               token->opr.ctx_type = WORD_LAST;
1847             }
1848           break;
1849         case 'b':
1850           if (!(syntax & RE_NO_GNU_OPS))
1851             {
1852               token->type = ANCHOR;
1853               token->opr.ctx_type = WORD_DELIM;
1854             }
1855           break;
1856         case 'B':
1857           if (!(syntax & RE_NO_GNU_OPS))
1858             {
1859               token->type = ANCHOR;
1860               token->opr.ctx_type = NOT_WORD_DELIM;
1861             }
1862           break;
1863         case 'w':
1864           if (!(syntax & RE_NO_GNU_OPS))
1865             token->type = OP_WORD;
1866           break;
1867         case 'W':
1868           if (!(syntax & RE_NO_GNU_OPS))
1869             token->type = OP_NOTWORD;
1870           break;
1871         case 's':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             token->type = OP_SPACE;
1874           break;
1875         case 'S':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             token->type = OP_NOTSPACE;
1878           break;
1879         case '`':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             {
1882               token->type = ANCHOR;
1883               token->opr.ctx_type = BUF_FIRST;
1884             }
1885           break;
1886         case '\'':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             {
1889               token->type = ANCHOR;
1890               token->opr.ctx_type = BUF_LAST;
1891             }
1892           break;
1893         case '(':
1894           if (!(syntax & RE_NO_BK_PARENS))
1895             token->type = OP_OPEN_SUBEXP;
1896           break;
1897         case ')':
1898           if (!(syntax & RE_NO_BK_PARENS))
1899             token->type = OP_CLOSE_SUBEXP;
1900           break;
1901         case '+':
1902           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903             token->type = OP_DUP_PLUS;
1904           break;
1905         case '?':
1906           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907             token->type = OP_DUP_QUESTION;
1908           break;
1909         case '{':
1910           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911             token->type = OP_OPEN_DUP_NUM;
1912           break;
1913         case '}':
1914           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915             token->type = OP_CLOSE_DUP_NUM;
1916           break;
1917         default:
1918           break;
1919         }
1920       return 2;
1921     }
1922
1923   token->type = CHARACTER;
1924 #ifdef RE_ENABLE_I18N
1925   if (input->mb_cur_max > 1)
1926     {
1927       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1929     }
1930   else
1931 #endif
1932     token->word_char = IS_WORD_CHAR (token->opr.c);
1933
1934   switch (c)
1935     {
1936     case '\n':
1937       if (syntax & RE_NEWLINE_ALT)
1938         token->type = OP_ALT;
1939       break;
1940     case '|':
1941       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942         token->type = OP_ALT;
1943       break;
1944     case '*':
1945       token->type = OP_DUP_ASTERISK;
1946       break;
1947     case '+':
1948       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949         token->type = OP_DUP_PLUS;
1950       break;
1951     case '?':
1952       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953         token->type = OP_DUP_QUESTION;
1954       break;
1955     case '{':
1956       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957         token->type = OP_OPEN_DUP_NUM;
1958       break;
1959     case '}':
1960       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961         token->type = OP_CLOSE_DUP_NUM;
1962       break;
1963     case '(':
1964       if (syntax & RE_NO_BK_PARENS)
1965         token->type = OP_OPEN_SUBEXP;
1966       break;
1967     case ')':
1968       if (syntax & RE_NO_BK_PARENS)
1969         token->type = OP_CLOSE_SUBEXP;
1970       break;
1971     case '[':
1972       token->type = OP_OPEN_BRACKET;
1973       break;
1974     case '.':
1975       token->type = OP_PERIOD;
1976       break;
1977     case '^':
1978       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979           re_string_cur_idx (input) != 0)
1980         {
1981           char prev = re_string_peek_byte (input, -1);
1982           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1983             break;
1984         }
1985       token->type = ANCHOR;
1986       token->opr.ctx_type = LINE_FIRST;
1987       break;
1988     case '$':
1989       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990           re_string_cur_idx (input) + 1 != re_string_length (input))
1991         {
1992           re_token_t next;
1993           re_string_skip_bytes (input, 1);
1994           peek_token (&next, input, syntax);
1995           re_string_skip_bytes (input, -1);
1996           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1997             break;
1998         }
1999       token->type = ANCHOR;
2000       token->opr.ctx_type = LINE_LAST;
2001       break;
2002     default:
2003       break;
2004     }
2005   return 1;
2006 }
2007
2008 /* Peek a token from INPUT, and return the length of the token.
2009    We must not use this function out of bracket expressions.  */
2010
2011 static int
2012 internal_function
2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2014 {
2015   unsigned char c;
2016   if (re_string_eoi (input))
2017     {
2018       token->type = END_OF_RE;
2019       return 0;
2020     }
2021   c = re_string_peek_byte (input, 0);
2022   token->opr.c = c;
2023
2024 #ifdef RE_ENABLE_I18N
2025   if (input->mb_cur_max > 1 &&
2026       !re_string_first_byte (input, re_string_cur_idx (input)))
2027     {
2028       token->type = CHARACTER;
2029       return 1;
2030     }
2031 #endif /* RE_ENABLE_I18N */
2032
2033   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034       && re_string_cur_idx (input) + 1 < re_string_length (input))
2035     {
2036       /* In this case, '\' escape a character.  */
2037       unsigned char c2;
2038       re_string_skip_bytes (input, 1);
2039       c2 = re_string_peek_byte (input, 0);
2040       token->opr.c = c2;
2041       token->type = CHARACTER;
2042       return 1;
2043     }
2044   if (c == '[') /* '[' is a special char in a bracket exps.  */
2045     {
2046       unsigned char c2;
2047       int token_len;
2048       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049         c2 = re_string_peek_byte (input, 1);
2050       else
2051         c2 = 0;
2052       token->opr.c = c2;
2053       token_len = 2;
2054       switch (c2)
2055         {
2056         case '.':
2057           token->type = OP_OPEN_COLL_ELEM;
2058           break;
2059         case '=':
2060           token->type = OP_OPEN_EQUIV_CLASS;
2061           break;
2062         case ':':
2063           if (syntax & RE_CHAR_CLASSES)
2064             {
2065               token->type = OP_OPEN_CHAR_CLASS;
2066               break;
2067             }
2068           /* else fall through.  */
2069         default:
2070           token->type = CHARACTER;
2071           token->opr.c = c;
2072           token_len = 1;
2073           break;
2074         }
2075       return token_len;
2076     }
2077   switch (c)
2078     {
2079     case '-':
2080       token->type = OP_CHARSET_RANGE;
2081       break;
2082     case ']':
2083       token->type = OP_CLOSE_BRACKET;
2084       break;
2085     case '^':
2086       token->type = OP_NON_MATCH_LIST;
2087       break;
2088     default:
2089       token->type = CHARACTER;
2090     }
2091   return 1;
2092 }
2093 \f
2094 /* Functions for parser.  */
2095
2096 /* Entry point of the parser.
2097    Parse the regular expression REGEXP and return the structure tree.
2098    If an error is occured, ERR is set by error code, and return NULL.
2099    This function build the following tree, from regular expression <reg_exp>:
2100            CAT
2101            / \
2102           /   \
2103    <reg_exp>  EOR
2104
2105    CAT means concatenation.
2106    EOR means end of regular expression.  */
2107
2108 static bin_tree_t *
2109 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2110        reg_errcode_t *err)
2111 {
2112   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113   bin_tree_t *tree, *eor, *root;
2114   re_token_t current_token;
2115   dfa->syntax = syntax;
2116   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2118   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119     return NULL;
2120   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2121   if (tree != NULL)
2122     root = create_tree (dfa, tree, eor, CONCAT);
2123   else
2124     root = eor;
2125   if (BE (eor == NULL || root == NULL, 0))
2126     {
2127       *err = REG_ESPACE;
2128       return NULL;
2129     }
2130   return root;
2131 }
2132
2133 /* This function build the following tree, from regular expression
2134    <branch1>|<branch2>:
2135            ALT
2136            / \
2137           /   \
2138    <branch1> <branch2>
2139
2140    ALT means alternative, which represents the operator `|'.  */
2141
2142 static bin_tree_t *
2143 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2145 {
2146   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147   bin_tree_t *tree, *branch = NULL;
2148   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150     return NULL;
2151
2152   while (token->type == OP_ALT)
2153     {
2154       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155       if (token->type != OP_ALT && token->type != END_OF_RE
2156           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2157         {
2158           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2160             return NULL;
2161         }
2162       else
2163         branch = NULL;
2164       tree = create_tree (dfa, tree, branch, OP_ALT);
2165       if (BE (tree == NULL, 0))
2166         {
2167           *err = REG_ESPACE;
2168           return NULL;
2169         }
2170     }
2171   return tree;
2172 }
2173
2174 /* This function build the following tree, from regular expression
2175    <exp1><exp2>:
2176         CAT
2177         / \
2178        /   \
2179    <exp1> <exp2>
2180
2181    CAT means concatenation.  */
2182
2183 static bin_tree_t *
2184 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2186 {
2187   bin_tree_t *tree, *expr;
2188   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2191     return NULL;
2192
2193   while (token->type != OP_ALT && token->type != END_OF_RE
2194          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2195     {
2196       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2197       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2198         {
2199           return NULL;
2200         }
2201       if (tree != NULL && expr != NULL)
2202         {
2203           tree = create_tree (dfa, tree, expr, CONCAT);
2204           if (tree == NULL)
2205             {
2206               *err = REG_ESPACE;
2207               return NULL;
2208             }
2209         }
2210       else if (tree == NULL)
2211         tree = expr;
2212       /* Otherwise expr == NULL, we don't need to create new tree.  */
2213     }
2214   return tree;
2215 }
2216
2217 /* This function build the following tree, from regular expression a*:
2218          *
2219          |
2220          a
2221 */
2222
2223 static bin_tree_t *
2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2226 {
2227   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228   bin_tree_t *tree;
2229   switch (token->type)
2230     {
2231     case CHARACTER:
2232       tree = create_token_tree (dfa, NULL, NULL, token);
2233       if (BE (tree == NULL, 0))
2234         {
2235           *err = REG_ESPACE;
2236           return NULL;
2237         }
2238 #ifdef RE_ENABLE_I18N
2239       if (dfa->mb_cur_max > 1)
2240         {
2241           while (!re_string_eoi (regexp)
2242                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243             {
2244               bin_tree_t *mbc_remain;
2245               fetch_token (token, regexp, syntax);
2246               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248               if (BE (mbc_remain == NULL || tree == NULL, 0))
2249                 {
2250                   *err = REG_ESPACE;
2251                   return NULL;
2252                 }
2253             }
2254         }
2255 #endif
2256       break;
2257     case OP_OPEN_SUBEXP:
2258       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260         return NULL;
2261       break;
2262     case OP_OPEN_BRACKET:
2263       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265         return NULL;
2266       break;
2267     case OP_BACK_REF:
2268       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269         {
2270           *err = REG_ESUBREG;
2271           return NULL;
2272         }
2273       dfa->used_bkref_map |= 1 << token->opr.idx;
2274       tree = create_token_tree (dfa, NULL, NULL, token);
2275       if (BE (tree == NULL, 0))
2276         {
2277           *err = REG_ESPACE;
2278           return NULL;
2279         }
2280       ++dfa->nbackref;
2281       dfa->has_mb_node = 1;
2282       break;
2283     case OP_OPEN_DUP_NUM:
2284       if (syntax & RE_CONTEXT_INVALID_DUP)
2285         {
2286           *err = REG_BADRPT;
2287           return NULL;
2288         }
2289       /* FALLTHROUGH */
2290     case OP_DUP_ASTERISK:
2291     case OP_DUP_PLUS:
2292     case OP_DUP_QUESTION:
2293       if (syntax & RE_CONTEXT_INVALID_OPS)
2294         {
2295           *err = REG_BADRPT;
2296           return NULL;
2297         }
2298       else if (syntax & RE_CONTEXT_INDEP_OPS)
2299         {
2300           fetch_token (token, regexp, syntax);
2301           return parse_expression (regexp, preg, token, syntax, nest, err);
2302         }
2303       /* else fall through  */
2304     case OP_CLOSE_SUBEXP:
2305       if ((token->type == OP_CLOSE_SUBEXP) &&
2306           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307         {
2308           *err = REG_ERPAREN;
2309           return NULL;
2310         }
2311       /* else fall through  */
2312     case OP_CLOSE_DUP_NUM:
2313       /* We treat it as a normal character.  */
2314
2315       /* Then we can these characters as normal characters.  */
2316       token->type = CHARACTER;
2317       /* mb_partial and word_char bits should be initialized already
2318          by peek_token.  */
2319       tree = create_token_tree (dfa, NULL, NULL, token);
2320       if (BE (tree == NULL, 0))
2321         {
2322           *err = REG_ESPACE;
2323           return NULL;
2324         }
2325       break;
2326     case ANCHOR:
2327       if ((token->opr.ctx_type
2328            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329           && dfa->word_ops_used == 0)
2330         init_word_char (dfa);
2331       if (token->opr.ctx_type == WORD_DELIM
2332           || token->opr.ctx_type == NOT_WORD_DELIM)
2333         {
2334           bin_tree_t *tree_first, *tree_last;
2335           if (token->opr.ctx_type == WORD_DELIM)
2336             {
2337               token->opr.ctx_type = WORD_FIRST;
2338               tree_first = create_token_tree (dfa, NULL, NULL, token);
2339               token->opr.ctx_type = WORD_LAST;
2340             }
2341           else
2342             {
2343               token->opr.ctx_type = INSIDE_WORD;
2344               tree_first = create_token_tree (dfa, NULL, NULL, token);
2345               token->opr.ctx_type = INSIDE_NOTWORD;
2346             }
2347           tree_last = create_token_tree (dfa, NULL, NULL, token);
2348           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350             {
2351               *err = REG_ESPACE;
2352               return NULL;
2353             }
2354         }
2355       else
2356         {
2357           tree = create_token_tree (dfa, NULL, NULL, token);
2358           if (BE (tree == NULL, 0))
2359             {
2360               *err = REG_ESPACE;
2361               return NULL;
2362             }
2363         }
2364       /* We must return here, since ANCHORs can't be followed
2365          by repetition operators.
2366          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2368       fetch_token (token, regexp, syntax);
2369       return tree;
2370     case OP_PERIOD:
2371       tree = create_token_tree (dfa, NULL, NULL, token);
2372       if (BE (tree == NULL, 0))
2373         {
2374           *err = REG_ESPACE;
2375           return NULL;
2376         }
2377       if (dfa->mb_cur_max > 1)
2378         dfa->has_mb_node = 1;
2379       break;
2380     case OP_WORD:
2381     case OP_NOTWORD:
2382       tree = build_charclass_op (dfa, regexp->trans,
2383                                  (const unsigned char *) "alnum",
2384                                  (const unsigned char *) "_",
2385                                  token->type == OP_NOTWORD, err);
2386       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387         return NULL;
2388       break;
2389     case OP_SPACE:
2390     case OP_NOTSPACE:
2391       tree = build_charclass_op (dfa, regexp->trans,
2392                                  (const unsigned char *) "space",
2393                                  (const unsigned char *) "",
2394                                  token->type == OP_NOTSPACE, err);
2395       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396         return NULL;
2397       break;
2398     case OP_ALT:
2399     case END_OF_RE:
2400       return NULL;
2401     case BACK_SLASH:
2402       *err = REG_EESCAPE;
2403       return NULL;
2404     default:
2405       /* Must not happen?  */
2406 #ifdef DEBUG
2407       assert (0);
2408 #endif
2409       return NULL;
2410     }
2411   fetch_token (token, regexp, syntax);
2412
2413   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415     {
2416       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418         return NULL;
2419       /* In BRE consecutive duplications are not allowed.  */
2420       if ((syntax & RE_CONTEXT_INVALID_DUP)
2421           && (token->type == OP_DUP_ASTERISK
2422               || token->type == OP_OPEN_DUP_NUM))
2423         {
2424           *err = REG_BADRPT;
2425           return NULL;
2426         }
2427     }
2428
2429   return tree;
2430 }
2431
2432 /* This function build the following tree, from regular expression
2433    (<reg_exp>):
2434          SUBEXP
2435             |
2436         <reg_exp>
2437 */
2438
2439 static bin_tree_t *
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2442 {
2443   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444   bin_tree_t *tree;
2445   size_t cur_nsub;
2446   cur_nsub = preg->re_nsub++;
2447
2448   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449
2450   /* The subexpression may be a null string.  */
2451   if (token->type == OP_CLOSE_SUBEXP)
2452     tree = NULL;
2453   else
2454     {
2455       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457         *err = REG_EPAREN;
2458       if (BE (*err != REG_NOERROR, 0))
2459         return NULL;
2460     }
2461
2462   if (cur_nsub <= '9' - '1')
2463     dfa->completed_bkref_map |= 1 << cur_nsub;
2464
2465   tree = create_tree (dfa, tree, NULL, SUBEXP);
2466   if (BE (tree == NULL, 0))
2467     {
2468       *err = REG_ESPACE;
2469       return NULL;
2470     }
2471   tree->token.opr.idx = cur_nsub;
2472   return tree;
2473 }
2474
2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2476
2477 static bin_tree_t *
2478 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2480 {
2481   bin_tree_t *tree = NULL, *old_tree = NULL;
2482   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2483   re_token_t start_token = *token;
2484
2485   if (token->type == OP_OPEN_DUP_NUM)
2486     {
2487       end = 0;
2488       start = fetch_number (regexp, token, syntax);
2489       if (start == REG_MISSING)
2490         {
2491           if (token->type == CHARACTER && token->opr.c == ',')
2492             start = 0; /* We treat "{,m}" as "{0,m}".  */
2493           else
2494             {
2495               *err = REG_BADBR; /* <re>{} is invalid.  */
2496               return NULL;
2497             }
2498         }
2499       if (BE (start != REG_ERROR, 1))
2500         {
2501           /* We treat "{n}" as "{n,n}".  */
2502           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2503                  : ((token->type == CHARACTER && token->opr.c == ',')
2504                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2505         }
2506       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2507         {
2508           /* Invalid sequence.  */
2509           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2510             {
2511               if (token->type == END_OF_RE)
2512                 *err = REG_EBRACE;
2513               else
2514                 *err = REG_BADBR;
2515
2516               return NULL;
2517             }
2518
2519           /* If the syntax bit is set, rollback.  */
2520           re_string_set_index (regexp, start_idx);
2521           *token = start_token;
2522           token->type = CHARACTER;
2523           /* mb_partial and word_char bits should be already initialized by
2524              peek_token.  */
2525           return elem;
2526         }
2527
2528       if (BE ((end != REG_MISSING && start > end)
2529               || token->type != OP_CLOSE_DUP_NUM, 0))
2530         {
2531           /* First number greater than second.  */
2532           *err = REG_BADBR;
2533           return NULL;
2534         }
2535     }
2536   else
2537     {
2538       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2539       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2540     }
2541
2542   fetch_token (token, regexp, syntax);
2543
2544   if (BE (elem == NULL, 0))
2545     return NULL;
2546   if (BE (start == 0 && end == 0, 0))
2547     {
2548       postorder (elem, free_tree, NULL);
2549       return NULL;
2550     }
2551
2552   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2553   if (BE (start > 0, 0))
2554     {
2555       tree = elem;
2556       for (i = 2; i <= start; ++i)
2557         {
2558           elem = duplicate_tree (elem, dfa);
2559           tree = create_tree (dfa, tree, elem, CONCAT);
2560           if (BE (elem == NULL || tree == NULL, 0))
2561             goto parse_dup_op_espace;
2562         }
2563
2564       if (start == end)
2565         return tree;
2566
2567       /* Duplicate ELEM before it is marked optional.  */
2568       elem = duplicate_tree (elem, dfa);
2569       old_tree = tree;
2570     }
2571   else
2572     old_tree = NULL;
2573
2574   if (elem->token.type == SUBEXP)
2575     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2576
2577   tree = create_tree (dfa, elem, NULL,
2578                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2579   if (BE (tree == NULL, 0))
2580     goto parse_dup_op_espace;
2581
2582 /* From gnulib's "intprops.h":
2583    True if the arithmetic type T is signed.  */
2584 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2585
2586   /* This loop is actually executed only when end != REG_MISSING,
2587      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2588      already created the start+1-th copy.  */
2589   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2590     for (i = start + 2; i <= end; ++i)
2591       {
2592         elem = duplicate_tree (elem, dfa);
2593         tree = create_tree (dfa, tree, elem, CONCAT);
2594         if (BE (elem == NULL || tree == NULL, 0))
2595           goto parse_dup_op_espace;
2596
2597         tree = create_tree (dfa, tree, NULL, OP_ALT);
2598         if (BE (tree == NULL, 0))
2599           goto parse_dup_op_espace;
2600       }
2601
2602   if (old_tree)
2603     tree = create_tree (dfa, old_tree, tree, CONCAT);
2604
2605   return tree;
2606
2607  parse_dup_op_espace:
2608   *err = REG_ESPACE;
2609   return NULL;
2610 }
2611
2612 /* Size of the names for collating symbol/equivalence_class/character_class.
2613    I'm not sure, but maybe enough.  */
2614 #define BRACKET_NAME_BUF_SIZE 32
2615
2616 #ifndef _LIBC
2617   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2618      Build the range expression which starts from START_ELEM, and ends
2619      at END_ELEM.  The result are written to MBCSET and SBCSET.
2620      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2621      mbcset->range_ends, is a pointer argument sinse we may
2622      update it.  */
2623
2624 static reg_errcode_t
2625 internal_function
2626 # ifdef RE_ENABLE_I18N
2627 build_range_exp (const reg_syntax_t syntax,
2628                  bitset_t sbcset,
2629                  re_charset_t *mbcset,
2630                  Idx *range_alloc,
2631                  const bracket_elem_t *start_elem,
2632                  const bracket_elem_t *end_elem)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (const reg_syntax_t syntax,
2635                  bitset_t sbcset,
2636                  const bracket_elem_t *start_elem,
2637                  const bracket_elem_t *end_elem)
2638 # endif /* not RE_ENABLE_I18N */
2639 {
2640   unsigned int start_ch, end_ch;
2641   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2642   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2643           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2644           0))
2645     return REG_ERANGE;
2646
2647   /* We can handle no multi character collating elements without libc
2648      support.  */
2649   if (BE ((start_elem->type == COLL_SYM
2650            && strlen ((char *) start_elem->opr.name) > 1)
2651           || (end_elem->type == COLL_SYM
2652               && strlen ((char *) end_elem->opr.name) > 1), 0))
2653     return REG_ECOLLATE;
2654
2655 # ifdef RE_ENABLE_I18N
2656   {
2657     wchar_t wc;
2658     wint_t start_wc;
2659     wint_t end_wc;
2660     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2661
2662     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2663                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2664                    : 0));
2665     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2666               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2667                  : 0));
2668     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2669                 ? __btowc (start_ch) : start_elem->opr.wch);
2670     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2671               ? __btowc (end_ch) : end_elem->opr.wch);
2672     if (start_wc == WEOF || end_wc == WEOF)
2673       return REG_ECOLLATE;
2674     cmp_buf[0] = start_wc;
2675     cmp_buf[4] = end_wc;
2676
2677     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2678             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2679       return REG_ERANGE;
2680
2681     /* Got valid collation sequence values, add them as a new entry.
2682        However, for !_LIBC we have no collation elements: if the
2683        character set is single byte, the single byte character set
2684        that we build below suffices.  parse_bracket_exp passes
2685        no MBCSET if dfa->mb_cur_max == 1.  */
2686     if (mbcset)
2687       {
2688         /* Check the space of the arrays.  */
2689         if (BE (*range_alloc == mbcset->nranges, 0))
2690           {
2691             /* There is not enough space, need realloc.  */
2692             wchar_t *new_array_start, *new_array_end;
2693             Idx new_nranges;
2694
2695             /* +1 in case of mbcset->nranges is 0.  */
2696             new_nranges = 2 * mbcset->nranges + 1;
2697             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2698                are NULL if *range_alloc == 0.  */
2699             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2700                                           new_nranges);
2701             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2702                                         new_nranges);
2703
2704             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2705               return REG_ESPACE;
2706
2707             mbcset->range_starts = new_array_start;
2708             mbcset->range_ends = new_array_end;
2709             *range_alloc = new_nranges;
2710           }
2711
2712         mbcset->range_starts[mbcset->nranges] = start_wc;
2713         mbcset->range_ends[mbcset->nranges++] = end_wc;
2714       }
2715
2716     /* Build the table for single byte characters.  */
2717     for (wc = 0; wc < SBC_MAX; ++wc)
2718       {
2719         cmp_buf[2] = wc;
2720         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2721             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2722           bitset_set (sbcset, wc);
2723       }
2724   }
2725 # else /* not RE_ENABLE_I18N */
2726   {
2727     unsigned int ch;
2728     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2729                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2730                    : 0));
2731     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2732               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2733                  : 0));
2734     if (start_ch > end_ch)
2735       return REG_ERANGE;
2736     /* Build the table for single byte characters.  */
2737     for (ch = 0; ch < SBC_MAX; ++ch)
2738       if (start_ch <= ch  && ch <= end_ch)
2739         bitset_set (sbcset, ch);
2740   }
2741 # endif /* not RE_ENABLE_I18N */
2742   return REG_NOERROR;
2743 }
2744 #endif /* not _LIBC */
2745
2746 #ifndef _LIBC
2747 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2748    Build the collating element which is represented by NAME.
2749    The result are written to MBCSET and SBCSET.
2750    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2751    pointer argument since we may update it.  */
2752
2753 static reg_errcode_t
2754 internal_function
2755 build_collating_symbol (bitset_t sbcset,
2756 # ifdef RE_ENABLE_I18N
2757                         re_charset_t *mbcset _UNUSED_PARAMETER_,
2758                         Idx *coll_sym_alloc _UNUSED_PARAMETER_,
2759 # endif
2760                         const unsigned char *name)
2761 {
2762   size_t name_len = strlen ((const char *) name);
2763   if (BE (name_len != 1, 0))
2764     return REG_ECOLLATE;
2765   else
2766     {
2767       bitset_set (sbcset, name[0]);
2768       return REG_NOERROR;
2769     }
2770 }
2771 #endif /* not _LIBC */
2772
2773 /* This function parse bracket expression like "[abc]", "[a-c]",
2774    "[[.a-a.]]" etc.  */
2775
2776 static bin_tree_t *
2777 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2778                    reg_syntax_t syntax, reg_errcode_t *err)
2779 {
2780 #ifdef _LIBC
2781   const unsigned char *collseqmb;
2782   const char *collseqwc;
2783   uint32_t nrules;
2784   int32_t table_size;
2785   const int32_t *symb_table;
2786   const unsigned char *extra;
2787
2788   /* Local function for parse_bracket_exp used in _LIBC environement.
2789      Seek the collating symbol entry correspondings to NAME.
2790      Return the index of the symbol in the SYMB_TABLE.  */
2791
2792   auto inline int32_t
2793   __attribute ((always_inline))
2794   seek_collating_symbol_entry (name, name_len)
2795          const unsigned char *name;
2796          size_t name_len;
2797     {
2798       int32_t hash = elem_hash ((const char *) name, name_len);
2799       int32_t elem = hash % table_size;
2800       if (symb_table[2 * elem] != 0)
2801         {
2802           int32_t second = hash % (table_size - 2) + 1;
2803
2804           do
2805             {
2806               /* First compare the hashing value.  */
2807               if (symb_table[2 * elem] == hash
2808                   /* Compare the length of the name.  */
2809                   && name_len == extra[symb_table[2 * elem + 1]]
2810                   /* Compare the name.  */
2811                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2812                              name_len) == 0)
2813                 {
2814                   /* Yep, this is the entry.  */
2815                   break;
2816                 }
2817
2818               /* Next entry.  */
2819               elem += second;
2820             }
2821           while (symb_table[2 * elem] != 0);
2822         }
2823       return elem;
2824     }
2825
2826   /* Local function for parse_bracket_exp used in _LIBC environment.
2827      Look up the collation sequence value of BR_ELEM.
2828      Return the value if succeeded, UINT_MAX otherwise.  */
2829
2830   auto inline unsigned int
2831   __attribute ((always_inline))
2832   lookup_collation_sequence_value (br_elem)
2833          bracket_elem_t *br_elem;
2834     {
2835       if (br_elem->type == SB_CHAR)
2836         {
2837           /*
2838           if (MB_CUR_MAX == 1)
2839           */
2840           if (nrules == 0)
2841             return collseqmb[br_elem->opr.ch];
2842           else
2843             {
2844               wint_t wc = __btowc (br_elem->opr.ch);
2845               return __collseq_table_lookup (collseqwc, wc);
2846             }
2847         }
2848       else if (br_elem->type == MB_CHAR)
2849         {
2850           if (nrules != 0)
2851             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2852         }
2853       else if (br_elem->type == COLL_SYM)
2854         {
2855           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2856           if (nrules != 0)
2857             {
2858               int32_t elem, idx;
2859               elem = seek_collating_symbol_entry (br_elem->opr.name,
2860                                                   sym_name_len);
2861               if (symb_table[2 * elem] != 0)
2862                 {
2863                   /* We found the entry.  */
2864                   idx = symb_table[2 * elem + 1];
2865                   /* Skip the name of collating element name.  */
2866                   idx += 1 + extra[idx];
2867                   /* Skip the byte sequence of the collating element.  */
2868                   idx += 1 + extra[idx];
2869                   /* Adjust for the alignment.  */
2870                   idx = (idx + 3) & ~3;
2871                   /* Skip the multibyte collation sequence value.  */
2872                   idx += sizeof (unsigned int);
2873                   /* Skip the wide char sequence of the collating element.  */
2874                   idx += sizeof (unsigned int) *
2875                     (1 + *(unsigned int *) (extra + idx));
2876                   /* Return the collation sequence value.  */
2877                   return *(unsigned int *) (extra + idx);
2878                 }
2879               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2880                 {
2881                   /* No valid character.  Match it as a single byte
2882                      character.  */
2883                   return collseqmb[br_elem->opr.name[0]];
2884                 }
2885             }
2886           else if (sym_name_len == 1)
2887             return collseqmb[br_elem->opr.name[0]];
2888         }
2889       return UINT_MAX;
2890     }
2891
2892   /* Local function for parse_bracket_exp used in _LIBC environement.
2893      Build the range expression which starts from START_ELEM, and ends
2894      at END_ELEM.  The result are written to MBCSET and SBCSET.
2895      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2896      mbcset->range_ends, is a pointer argument sinse we may
2897      update it.  */
2898
2899   auto inline reg_errcode_t
2900   __attribute ((always_inline))
2901   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2902          re_charset_t *mbcset;
2903          Idx *range_alloc;
2904          bitset_t sbcset;
2905          bracket_elem_t *start_elem, *end_elem;
2906     {
2907       unsigned int ch;
2908       uint32_t start_collseq;
2909       uint32_t end_collseq;
2910
2911       /* Equivalence Classes and Character Classes can't be a range
2912          start/end.  */
2913       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2914               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2915               0))
2916         return REG_ERANGE;
2917
2918       start_collseq = lookup_collation_sequence_value (start_elem);
2919       end_collseq = lookup_collation_sequence_value (end_elem);
2920       /* Check start/end collation sequence values.  */
2921       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2922         return REG_ECOLLATE;
2923       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2924         return REG_ERANGE;
2925
2926       /* Got valid collation sequence values, add them as a new entry.
2927          However, if we have no collation elements, and the character set
2928          is single byte, the single byte character set that we
2929          build below suffices. */
2930       if (nrules > 0 || dfa->mb_cur_max > 1)
2931         {
2932           /* Check the space of the arrays.  */
2933           if (BE (*range_alloc == mbcset->nranges, 0))
2934             {
2935               /* There is not enough space, need realloc.  */
2936               uint32_t *new_array_start;
2937               uint32_t *new_array_end;
2938               Idx new_nranges;
2939
2940               /* +1 in case of mbcset->nranges is 0.  */
2941               new_nranges = 2 * mbcset->nranges + 1;
2942               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2943                                             new_nranges);
2944               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2945                                           new_nranges);
2946
2947               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2948                 return REG_ESPACE;
2949
2950               mbcset->range_starts = new_array_start;
2951               mbcset->range_ends = new_array_end;
2952               *range_alloc = new_nranges;
2953             }
2954
2955           mbcset->range_starts[mbcset->nranges] = start_collseq;
2956           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2957         }
2958
2959       /* Build the table for single byte characters.  */
2960       for (ch = 0; ch < SBC_MAX; ch++)
2961         {
2962           uint32_t ch_collseq;
2963           /*
2964           if (MB_CUR_MAX == 1)
2965           */
2966           if (nrules == 0)
2967             ch_collseq = collseqmb[ch];
2968           else
2969             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2970           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2971             bitset_set (sbcset, ch);
2972         }
2973       return REG_NOERROR;
2974     }
2975
2976   /* Local function for parse_bracket_exp used in _LIBC environement.
2977      Build the collating element which is represented by NAME.
2978      The result are written to MBCSET and SBCSET.
2979      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2980      pointer argument sinse we may update it.  */
2981
2982   auto inline reg_errcode_t
2983   __attribute ((always_inline))
2984   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2985          re_charset_t *mbcset;
2986          Idx *coll_sym_alloc;
2987          bitset_t sbcset;
2988          const unsigned char *name;
2989     {
2990       int32_t elem, idx;
2991       size_t name_len = strlen ((const char *) name);
2992       if (nrules != 0)
2993         {
2994           elem = seek_collating_symbol_entry (name, name_len);
2995           if (symb_table[2 * elem] != 0)
2996             {
2997               /* We found the entry.  */
2998               idx = symb_table[2 * elem + 1];
2999               /* Skip the name of collating element name.  */
3000               idx += 1 + extra[idx];
3001             }
3002           else if (symb_table[2 * elem] == 0 && name_len == 1)
3003             {
3004               /* No valid character, treat it as a normal
3005                  character.  */
3006               bitset_set (sbcset, name[0]);
3007               return REG_NOERROR;
3008             }
3009           else
3010             return REG_ECOLLATE;
3011
3012           /* Got valid collation sequence, add it as a new entry.  */
3013           /* Check the space of the arrays.  */
3014           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3015             {
3016               /* Not enough, realloc it.  */
3017               /* +1 in case of mbcset->ncoll_syms is 0.  */
3018               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3019               /* Use realloc since mbcset->coll_syms is NULL
3020                  if *alloc == 0.  */
3021               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3022                                                    new_coll_sym_alloc);
3023               if (BE (new_coll_syms == NULL, 0))
3024                 return REG_ESPACE;
3025               mbcset->coll_syms = new_coll_syms;
3026               *coll_sym_alloc = new_coll_sym_alloc;
3027             }
3028           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3029           return REG_NOERROR;
3030         }
3031       else
3032         {
3033           if (BE (name_len != 1, 0))
3034             return REG_ECOLLATE;
3035           else
3036             {
3037               bitset_set (sbcset, name[0]);
3038               return REG_NOERROR;
3039             }
3040         }
3041     }
3042 #endif
3043
3044   re_token_t br_token;
3045   re_bitset_ptr_t sbcset;
3046 #ifdef RE_ENABLE_I18N
3047   re_charset_t *mbcset;
3048   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3049   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3050 #endif /* not RE_ENABLE_I18N */
3051   bool non_match = false;
3052   bin_tree_t *work_tree;
3053   int token_len;
3054   bool first_round = true;
3055 #ifdef _LIBC
3056   collseqmb = (const unsigned char *)
3057     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3058   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3059   if (nrules)
3060     {
3061       /*
3062       if (MB_CUR_MAX > 1)
3063       */
3064       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3065       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3066       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3067                                                   _NL_COLLATE_SYMB_TABLEMB);
3068       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3069                                                    _NL_COLLATE_SYMB_EXTRAMB);
3070     }
3071 #endif
3072   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3073 #ifdef RE_ENABLE_I18N
3074   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3075 #endif /* RE_ENABLE_I18N */
3076 #ifdef RE_ENABLE_I18N
3077   if (BE (sbcset == NULL || mbcset == NULL, 0))
3078 #else
3079   if (BE (sbcset == NULL, 0))
3080 #endif /* RE_ENABLE_I18N */
3081     {
3082       *err = REG_ESPACE;
3083       return NULL;
3084     }
3085
3086   token_len = peek_token_bracket (token, regexp, syntax);
3087   if (BE (token->type == END_OF_RE, 0))
3088     {
3089       *err = REG_BADPAT;
3090       goto parse_bracket_exp_free_return;
3091     }
3092   if (token->type == OP_NON_MATCH_LIST)
3093     {
3094 #ifdef RE_ENABLE_I18N
3095       mbcset->non_match = 1;
3096 #endif /* not RE_ENABLE_I18N */
3097       non_match = true;
3098       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3099         bitset_set (sbcset, '\n');
3100       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3101       token_len = peek_token_bracket (token, regexp, syntax);
3102       if (BE (token->type == END_OF_RE, 0))
3103         {
3104           *err = REG_BADPAT;
3105           goto parse_bracket_exp_free_return;
3106         }
3107     }
3108
3109   /* We treat the first ']' as a normal character.  */
3110   if (token->type == OP_CLOSE_BRACKET)
3111     token->type = CHARACTER;
3112
3113   while (1)
3114     {
3115       bracket_elem_t start_elem, end_elem;
3116       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3117       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3118       reg_errcode_t ret;
3119       int token_len2 = 0;
3120       bool is_range_exp = false;
3121       re_token_t token2;
3122
3123       start_elem.opr.name = start_name_buf;
3124       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3125                                    syntax, first_round);
3126       if (BE (ret != REG_NOERROR, 0))
3127         {
3128           *err = ret;
3129           goto parse_bracket_exp_free_return;
3130         }
3131       first_round = false;
3132
3133       /* Get information about the next token.  We need it in any case.  */
3134       token_len = peek_token_bracket (token, regexp, syntax);
3135
3136       /* Do not check for ranges if we know they are not allowed.  */
3137       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3138         {
3139           if (BE (token->type == END_OF_RE, 0))
3140             {
3141               *err = REG_EBRACK;
3142               goto parse_bracket_exp_free_return;
3143             }
3144           if (token->type == OP_CHARSET_RANGE)
3145             {
3146               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3147               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3148               if (BE (token2.type == END_OF_RE, 0))
3149                 {
3150                   *err = REG_EBRACK;
3151                   goto parse_bracket_exp_free_return;
3152                 }
3153               if (token2.type == OP_CLOSE_BRACKET)
3154                 {
3155                   /* We treat the last '-' as a normal character.  */
3156                   re_string_skip_bytes (regexp, -token_len);
3157                   token->type = CHARACTER;
3158                 }
3159               else
3160                 is_range_exp = true;
3161             }
3162         }
3163
3164       if (is_range_exp == true)
3165         {
3166           end_elem.opr.name = end_name_buf;
3167           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3168                                        dfa, syntax, true);
3169           if (BE (ret != REG_NOERROR, 0))
3170             {
3171               *err = ret;
3172               goto parse_bracket_exp_free_return;
3173             }
3174
3175           token_len = peek_token_bracket (token, regexp, syntax);
3176
3177 #ifdef _LIBC
3178           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3179                                   &start_elem, &end_elem);
3180 #else
3181 # ifdef RE_ENABLE_I18N
3182           *err = build_range_exp (syntax, sbcset,
3183                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3184                                   &range_alloc, &start_elem, &end_elem);
3185 # else
3186           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3187 # endif
3188 #endif /* RE_ENABLE_I18N */
3189           if (BE (*err != REG_NOERROR, 0))
3190             goto parse_bracket_exp_free_return;
3191         }
3192       else
3193         {
3194           switch (start_elem.type)
3195             {
3196             case SB_CHAR:
3197               bitset_set (sbcset, start_elem.opr.ch);
3198               break;
3199 #ifdef RE_ENABLE_I18N
3200             case MB_CHAR:
3201               /* Check whether the array has enough space.  */
3202               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3203                 {
3204                   wchar_t *new_mbchars;
3205                   /* Not enough, realloc it.  */
3206                   /* +1 in case of mbcset->nmbchars is 0.  */
3207                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3208                   /* Use realloc since array is NULL if *alloc == 0.  */
3209                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3210                                             mbchar_alloc);
3211                   if (BE (new_mbchars == NULL, 0))
3212                     goto parse_bracket_exp_espace;
3213                   mbcset->mbchars = new_mbchars;
3214                 }
3215               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3216               break;
3217 #endif /* RE_ENABLE_I18N */
3218             case EQUIV_CLASS:
3219               *err = build_equiv_class (sbcset,
3220 #ifdef RE_ENABLE_I18N
3221                                         mbcset, &equiv_class_alloc,
3222 #endif /* RE_ENABLE_I18N */
3223                                         start_elem.opr.name);
3224               if (BE (*err != REG_NOERROR, 0))
3225                 goto parse_bracket_exp_free_return;
3226               break;
3227             case COLL_SYM:
3228               *err = build_collating_symbol (sbcset,
3229 #ifdef RE_ENABLE_I18N
3230                                              mbcset, &coll_sym_alloc,
3231 #endif /* RE_ENABLE_I18N */
3232                                              start_elem.opr.name);
3233               if (BE (*err != REG_NOERROR, 0))
3234                 goto parse_bracket_exp_free_return;
3235               break;
3236             case CHAR_CLASS:
3237               *err = build_charclass (regexp->trans, sbcset,
3238 #ifdef RE_ENABLE_I18N
3239                                       mbcset, &char_class_alloc,
3240 #endif /* RE_ENABLE_I18N */
3241                                       start_elem.opr.name, syntax);
3242               if (BE (*err != REG_NOERROR, 0))
3243                goto parse_bracket_exp_free_return;
3244               break;
3245             default:
3246               assert (0);
3247               break;
3248             }
3249         }
3250       if (BE (token->type == END_OF_RE, 0))
3251         {
3252           *err = REG_EBRACK;
3253           goto parse_bracket_exp_free_return;
3254         }
3255       if (token->type == OP_CLOSE_BRACKET)
3256         break;
3257     }
3258
3259   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3260
3261   /* If it is non-matching list.  */
3262   if (non_match)
3263     bitset_not (sbcset);
3264
3265 #ifdef RE_ENABLE_I18N
3266   /* Ensure only single byte characters are set.  */
3267   if (dfa->mb_cur_max > 1)
3268     bitset_mask (sbcset, dfa->sb_char);
3269
3270   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3271       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3272                                                      || mbcset->non_match)))
3273     {
3274       bin_tree_t *mbc_tree;
3275       int sbc_idx;
3276       /* Build a tree for complex bracket.  */
3277       dfa->has_mb_node = 1;
3278       br_token.type = COMPLEX_BRACKET;
3279       br_token.opr.mbcset = mbcset;
3280       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3281       if (BE (mbc_tree == NULL, 0))
3282         goto parse_bracket_exp_espace;
3283       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3284         if (sbcset[sbc_idx])
3285           break;
3286       /* If there are no bits set in sbcset, there is no point
3287          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3288       if (sbc_idx < BITSET_WORDS)
3289         {
3290           /* Build a tree for simple bracket.  */
3291           br_token.type = SIMPLE_BRACKET;
3292           br_token.opr.sbcset = sbcset;
3293           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3294           if (BE (work_tree == NULL, 0))
3295             goto parse_bracket_exp_espace;
3296
3297           /* Then join them by ALT node.  */
3298           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3299           if (BE (work_tree == NULL, 0))
3300             goto parse_bracket_exp_espace;
3301         }
3302       else
3303         {
3304           re_free (sbcset);
3305           work_tree = mbc_tree;
3306         }
3307     }
3308   else
3309 #endif /* not RE_ENABLE_I18N */
3310     {
3311 #ifdef RE_ENABLE_I18N
3312       free_charset (mbcset);
3313 #endif
3314       /* Build a tree for simple bracket.  */
3315       br_token.type = SIMPLE_BRACKET;
3316       br_token.opr.sbcset = sbcset;
3317       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3318       if (BE (work_tree == NULL, 0))
3319         goto parse_bracket_exp_espace;
3320     }
3321   return work_tree;
3322
3323  parse_bracket_exp_espace:
3324   *err = REG_ESPACE;
3325  parse_bracket_exp_free_return:
3326   re_free (sbcset);
3327 #ifdef RE_ENABLE_I18N
3328   free_charset (mbcset);
3329 #endif /* RE_ENABLE_I18N */
3330   return NULL;
3331 }
3332
3333 /* Parse an element in the bracket expression.  */
3334
3335 static reg_errcode_t
3336 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3337                        re_token_t *token, int token_len,
3338                        re_dfa_t *dfa _UNUSED_PARAMETER_,
3339                        reg_syntax_t syntax, bool accept_hyphen)
3340 {
3341 #ifdef RE_ENABLE_I18N
3342   int cur_char_size;
3343   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3344   if (cur_char_size > 1)
3345     {
3346       elem->type = MB_CHAR;
3347       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3348       re_string_skip_bytes (regexp, cur_char_size);
3349       return REG_NOERROR;
3350     }
3351 #endif /* RE_ENABLE_I18N */
3352   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3353   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3354       || token->type == OP_OPEN_EQUIV_CLASS)
3355     return parse_bracket_symbol (elem, regexp, token);
3356   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3357     {
3358       /* A '-' must only appear as anything but a range indicator before
3359          the closing bracket.  Everything else is an error.  */
3360       re_token_t token2;
3361       (void) peek_token_bracket (&token2, regexp, syntax);
3362       if (token2.type != OP_CLOSE_BRACKET)
3363         /* The actual error value is not standardized since this whole
3364            case is undefined.  But ERANGE makes good sense.  */
3365         return REG_ERANGE;
3366     }
3367   elem->type = SB_CHAR;
3368   elem->opr.ch = token->opr.c;
3369   return REG_NOERROR;
3370 }
3371
3372 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3373    such as [:<character_class>:], [.<collating_element>.], and
3374    [=<equivalent_class>=].  */
3375
3376 static reg_errcode_t
3377 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3378                       re_token_t *token)
3379 {
3380   unsigned char ch, delim = token->opr.c;
3381   int i = 0;
3382   if (re_string_eoi(regexp))
3383     return REG_EBRACK;
3384   for (;; ++i)
3385     {
3386       if (i >= BRACKET_NAME_BUF_SIZE)
3387         return REG_EBRACK;
3388       if (token->type == OP_OPEN_CHAR_CLASS)
3389         ch = re_string_fetch_byte_case (regexp);
3390       else
3391         ch = re_string_fetch_byte (regexp);
3392       if (re_string_eoi(regexp))
3393         return REG_EBRACK;
3394       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3395         break;
3396       elem->opr.name[i] = ch;
3397     }
3398   re_string_skip_bytes (regexp, 1);
3399   elem->opr.name[i] = '\0';
3400   switch (token->type)
3401     {
3402     case OP_OPEN_COLL_ELEM:
3403       elem->type = COLL_SYM;
3404       break;
3405     case OP_OPEN_EQUIV_CLASS:
3406       elem->type = EQUIV_CLASS;
3407       break;
3408     case OP_OPEN_CHAR_CLASS:
3409       elem->type = CHAR_CLASS;
3410       break;
3411     default:
3412       break;
3413     }
3414   return REG_NOERROR;
3415 }
3416
3417   /* Helper function for parse_bracket_exp.
3418      Build the equivalence class which is represented by NAME.
3419      The result are written to MBCSET and SBCSET.
3420      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3421      is a pointer argument sinse we may update it.  */
3422
3423 static reg_errcode_t
3424 #ifdef RE_ENABLE_I18N
3425 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset _UNUSED_PARAMETER_,
3426                    Idx *equiv_class_alloc _UNUSED_PARAMETER_,
3427                    const unsigned char *name)
3428 #else /* not RE_ENABLE_I18N */
3429 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3430 #endif /* not RE_ENABLE_I18N */
3431 {
3432 #ifdef _LIBC
3433   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3434   if (nrules != 0)
3435     {
3436       const int32_t *table, *indirect;
3437       const unsigned char *weights, *extra, *cp;
3438       unsigned char char_buf[2];
3439       int32_t idx1, idx2;
3440       unsigned int ch;
3441       size_t len;
3442       /* This #include defines a local function!  */
3443 # include <locale/weight.h>
3444       /* Calculate the index for equivalence class.  */
3445       cp = name;
3446       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3447       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3448                                                _NL_COLLATE_WEIGHTMB);
3449       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3450                                                    _NL_COLLATE_EXTRAMB);
3451       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3452                                                 _NL_COLLATE_INDIRECTMB);
3453       idx1 = findidx (&cp);
3454       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3455         /* This isn't a valid character.  */
3456         return REG_ECOLLATE;
3457
3458       /* Build single byte matcing table for this equivalence class.  */
3459       char_buf[1] = (unsigned char) '\0';
3460       len = weights[idx1 & 0xffffff];
3461       for (ch = 0; ch < SBC_MAX; ++ch)
3462         {
3463           char_buf[0] = ch;
3464           cp = char_buf;
3465           idx2 = findidx (&cp);
3466 /*
3467           idx2 = table[ch];
3468 */
3469           if (idx2 == 0)
3470             /* This isn't a valid character.  */
3471             continue;
3472           /* Compare only if the length matches and the collation rule
3473              index is the same.  */
3474           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3475             {
3476               int cnt = 0;
3477
3478               while (cnt <= len &&
3479                      weights[(idx1 & 0xffffff) + 1 + cnt]
3480                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3481                 ++cnt;
3482
3483               if (cnt > len)
3484                 bitset_set (sbcset, ch);
3485             }
3486         }
3487       /* Check whether the array has enough space.  */
3488       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3489         {
3490           /* Not enough, realloc it.  */
3491           /* +1 in case of mbcset->nequiv_classes is 0.  */
3492           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3493           /* Use realloc since the array is NULL if *alloc == 0.  */
3494           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3495                                                    int32_t,
3496                                                    new_equiv_class_alloc);
3497           if (BE (new_equiv_classes == NULL, 0))
3498             return REG_ESPACE;
3499           mbcset->equiv_classes = new_equiv_classes;
3500           *equiv_class_alloc = new_equiv_class_alloc;
3501         }
3502       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3503     }
3504   else
3505 #endif /* _LIBC */
3506     {
3507       if (BE (strlen ((const char *) name) != 1, 0))
3508         return REG_ECOLLATE;
3509       bitset_set (sbcset, *name);
3510     }
3511   return REG_NOERROR;
3512 }
3513
3514   /* Helper function for parse_bracket_exp.
3515      Build the character class which is represented by NAME.
3516      The result are written to MBCSET and SBCSET.
3517      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3518      is a pointer argument sinse we may update it.  */
3519
3520 static reg_errcode_t
3521 #ifdef RE_ENABLE_I18N
3522 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3523                  re_charset_t *mbcset, Idx *char_class_alloc,
3524                  const unsigned char *class_name, reg_syntax_t syntax)
3525 #else /* not RE_ENABLE_I18N */
3526 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3527                  const unsigned char *class_name, reg_syntax_t syntax)
3528 #endif /* not RE_ENABLE_I18N */
3529 {
3530   int i;
3531   const char *name = (const char *) class_name;
3532
3533   /* In case of REG_ICASE "upper" and "lower" match the both of
3534      upper and lower cases.  */
3535   if ((syntax & RE_ICASE)
3536       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3537     name = "alpha";
3538
3539 #ifdef RE_ENABLE_I18N
3540   /* Check the space of the arrays.  */
3541   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3542     {
3543       /* Not enough, realloc it.  */
3544       /* +1 in case of mbcset->nchar_classes is 0.  */
3545       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3546       /* Use realloc since array is NULL if *alloc == 0.  */
3547       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3548                                                new_char_class_alloc);
3549       if (BE (new_char_classes == NULL, 0))
3550         return REG_ESPACE;
3551       mbcset->char_classes = new_char_classes;
3552       *char_class_alloc = new_char_class_alloc;
3553     }
3554   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3555 #endif /* RE_ENABLE_I18N */
3556
3557 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3558   do {                                          \
3559     if (BE (trans != NULL, 0))                  \
3560       {                                         \
3561         for (i = 0; i < SBC_MAX; ++i)           \
3562           if (ctype_func (i))                   \
3563             bitset_set (sbcset, trans[i]);      \
3564       }                                         \
3565     else                                        \
3566       {                                         \
3567         for (i = 0; i < SBC_MAX; ++i)           \
3568           if (ctype_func (i))                   \
3569             bitset_set (sbcset, i);             \
3570       }                                         \
3571   } while (0)
3572
3573   if (strcmp (name, "alnum") == 0)
3574     BUILD_CHARCLASS_LOOP (isalnum);
3575   else if (strcmp (name, "cntrl") == 0)
3576     BUILD_CHARCLASS_LOOP (iscntrl);
3577   else if (strcmp (name, "lower") == 0)
3578     BUILD_CHARCLASS_LOOP (islower);
3579   else if (strcmp (name, "space") == 0)
3580     BUILD_CHARCLASS_LOOP (isspace);
3581   else if (strcmp (name, "alpha") == 0)
3582     BUILD_CHARCLASS_LOOP (isalpha);
3583   else if (strcmp (name, "digit") == 0)
3584     BUILD_CHARCLASS_LOOP (isdigit);
3585   else if (strcmp (name, "print") == 0)
3586     BUILD_CHARCLASS_LOOP (isprint);
3587   else if (strcmp (name, "upper") == 0)
3588     BUILD_CHARCLASS_LOOP (isupper);
3589   else if (strcmp (name, "blank") == 0)
3590     BUILD_CHARCLASS_LOOP (isblank);
3591   else if (strcmp (name, "graph") == 0)
3592     BUILD_CHARCLASS_LOOP (isgraph);
3593   else if (strcmp (name, "punct") == 0)
3594     BUILD_CHARCLASS_LOOP (ispunct);
3595   else if (strcmp (name, "xdigit") == 0)
3596     BUILD_CHARCLASS_LOOP (isxdigit);
3597   else
3598     return REG_ECTYPE;
3599
3600   return REG_NOERROR;
3601 }
3602
3603 static bin_tree_t *
3604 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3605                     const unsigned char *class_name,
3606                     const unsigned char *extra, bool non_match,
3607                     reg_errcode_t *err)
3608 {
3609   re_bitset_ptr_t sbcset;
3610 #ifdef RE_ENABLE_I18N
3611   re_charset_t *mbcset;
3612   Idx alloc = 0;
3613 #endif /* not RE_ENABLE_I18N */
3614   reg_errcode_t ret;
3615   re_token_t br_token;
3616   bin_tree_t *tree;
3617
3618   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3619 #ifdef RE_ENABLE_I18N
3620   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3621 #endif /* RE_ENABLE_I18N */
3622
3623 #ifdef RE_ENABLE_I18N
3624   if (BE (sbcset == NULL || mbcset == NULL, 0))
3625 #else /* not RE_ENABLE_I18N */
3626   if (BE (sbcset == NULL, 0))
3627 #endif /* not RE_ENABLE_I18N */
3628     {
3629       *err = REG_ESPACE;
3630       return NULL;
3631     }
3632
3633   if (non_match)
3634     {
3635 #ifdef RE_ENABLE_I18N
3636       mbcset->non_match = 1;
3637 #endif /* not RE_ENABLE_I18N */
3638     }
3639
3640   /* We don't care the syntax in this case.  */
3641   ret = build_charclass (trans, sbcset,
3642 #ifdef RE_ENABLE_I18N
3643                          mbcset, &alloc,
3644 #endif /* RE_ENABLE_I18N */
3645                          class_name, 0);
3646
3647   if (BE (ret != REG_NOERROR, 0))
3648     {
3649       re_free (sbcset);
3650 #ifdef RE_ENABLE_I18N
3651       free_charset (mbcset);
3652 #endif /* RE_ENABLE_I18N */
3653       *err = ret;
3654       return NULL;
3655     }
3656   /* \w match '_' also.  */
3657   for (; *extra; extra++)
3658     bitset_set (sbcset, *extra);
3659
3660   /* If it is non-matching list.  */
3661   if (non_match)
3662     bitset_not (sbcset);
3663
3664 #ifdef RE_ENABLE_I18N
3665   /* Ensure only single byte characters are set.  */
3666   if (dfa->mb_cur_max > 1)
3667     bitset_mask (sbcset, dfa->sb_char);
3668 #endif
3669
3670   /* Build a tree for simple bracket.  */
3671   br_token.type = SIMPLE_BRACKET;
3672   br_token.opr.sbcset = sbcset;
3673   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3674   if (BE (tree == NULL, 0))
3675     goto build_word_op_espace;
3676
3677 #ifdef RE_ENABLE_I18N
3678   if (dfa->mb_cur_max > 1)
3679     {
3680       bin_tree_t *mbc_tree;
3681       /* Build a tree for complex bracket.  */
3682       br_token.type = COMPLEX_BRACKET;
3683       br_token.opr.mbcset = mbcset;
3684       dfa->has_mb_node = 1;
3685       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3686       if (BE (mbc_tree == NULL, 0))
3687         goto build_word_op_espace;
3688       /* Then join them by ALT node.  */
3689       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3690       if (BE (mbc_tree != NULL, 1))
3691         return tree;
3692     }
3693   else
3694     {
3695       free_charset (mbcset);
3696       return tree;
3697     }
3698 #else /* not RE_ENABLE_I18N */
3699   return tree;
3700 #endif /* not RE_ENABLE_I18N */
3701
3702  build_word_op_espace:
3703   re_free (sbcset);
3704 #ifdef RE_ENABLE_I18N
3705   free_charset (mbcset);
3706 #endif /* RE_ENABLE_I18N */
3707   *err = REG_ESPACE;
3708   return NULL;
3709 }
3710
3711 /* This is intended for the expressions like "a{1,3}".
3712    Fetch a number from `input', and return the number.
3713    Return REG_MISSING if the number field is empty like "{,1}".
3714    Return REG_ERROR if an error occurred.  */
3715
3716 static Idx
3717 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3718 {
3719   Idx num = REG_MISSING;
3720   unsigned char c;
3721   while (1)
3722     {
3723       fetch_token (token, input, syntax);
3724       c = token->opr.c;
3725       if (BE (token->type == END_OF_RE, 0))
3726         return REG_ERROR;
3727       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3728         break;
3729       num = ((token->type != CHARACTER || c < '0' || '9' < c
3730               || num == REG_ERROR)
3731              ? REG_ERROR
3732              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3733       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3734     }
3735   return num;
3736 }
3737 \f
3738 #ifdef RE_ENABLE_I18N
3739 static void
3740 free_charset (re_charset_t *cset)
3741 {
3742   re_free (cset->mbchars);
3743 # ifdef _LIBC
3744   re_free (cset->coll_syms);
3745   re_free (cset->equiv_classes);
3746   re_free (cset->range_starts);
3747   re_free (cset->range_ends);
3748 # endif
3749   re_free (cset->char_classes);
3750   re_free (cset);
3751 }
3752 #endif /* RE_ENABLE_I18N */
3753 \f
3754 /* Functions for binary tree operation.  */
3755
3756 /* Create a tree node.  */
3757
3758 static bin_tree_t *
3759 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3760              re_token_type_t type)
3761 {
3762   re_token_t t;
3763   t.type = type;
3764   return create_token_tree (dfa, left, right, &t);
3765 }
3766
3767 static bin_tree_t *
3768 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3769                    const re_token_t *token)
3770 {
3771   bin_tree_t *tree;
3772   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3773     {
3774       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3775
3776       if (storage == NULL)
3777         return NULL;
3778       storage->next = dfa->str_tree_storage;
3779       dfa->str_tree_storage = storage;
3780       dfa->str_tree_storage_idx = 0;
3781     }
3782   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3783
3784   tree->parent = NULL;
3785   tree->left = left;
3786   tree->right = right;
3787   tree->token = *token;
3788   tree->token.duplicated = 0;
3789   tree->token.opt_subexp = 0;
3790   tree->first = NULL;
3791   tree->next = NULL;
3792   tree->node_idx = REG_MISSING;
3793
3794   if (left != NULL)
3795     left->parent = tree;
3796   if (right != NULL)
3797     right->parent = tree;
3798   return tree;
3799 }
3800
3801 /* Mark the tree SRC as an optional subexpression.
3802    To be called from preorder or postorder.  */
3803
3804 static reg_errcode_t
3805 mark_opt_subexp (void *extra, bin_tree_t *node)
3806 {
3807   Idx idx = (Idx) (long) extra;
3808   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3809     node->token.opt_subexp = 1;
3810
3811   return REG_NOERROR;
3812 }
3813
3814 /* Free the allocated memory inside NODE. */
3815
3816 static void
3817 free_token (re_token_t *node)
3818 {
3819 #ifdef RE_ENABLE_I18N
3820   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3821     free_charset (node->opr.mbcset);
3822   else
3823 #endif /* RE_ENABLE_I18N */
3824     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3825       re_free (node->opr.sbcset);
3826 }
3827
3828 /* Worker function for tree walking.  Free the allocated memory inside NODE
3829    and its children. */
3830
3831 static reg_errcode_t
3832 free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
3833 {
3834   free_token (&node->token);
3835   return REG_NOERROR;
3836 }
3837
3838
3839 /* Duplicate the node SRC, and return new node.  This is a preorder
3840    visit similar to the one implemented by the generic visitor, but
3841    we need more infrastructure to maintain two parallel trees --- so,
3842    it's easier to duplicate.  */
3843
3844 static bin_tree_t *
3845 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3846 {
3847   const bin_tree_t *node;
3848   bin_tree_t *dup_root;
3849   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3850
3851   for (node = root; ; )
3852     {
3853       /* Create a new tree and link it back to the current parent.  */
3854       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3855       if (*p_new == NULL)
3856         return NULL;
3857       (*p_new)->parent = dup_node;
3858       (*p_new)->token.duplicated = 1;
3859       dup_node = *p_new;
3860
3861       /* Go to the left node, or up and to the right.  */
3862       if (node->left)
3863         {
3864           node = node->left;
3865           p_new = &dup_node->left;
3866         }
3867       else
3868         {
3869           const bin_tree_t *prev = NULL;
3870           while (node->right == prev || node->right == NULL)
3871             {
3872               prev = node;
3873               node = node->parent;
3874               dup_node = dup_node->parent;
3875               if (!node)
3876                 return dup_root;
3877             }
3878           node = node->right;
3879           p_new = &dup_node->right;
3880         }
3881     }
3882 }