contrib/expat/lib/xmltok.c

   1 /*
   2                             __  __            _
   3                          ___\ \/ /_ __   __ _| |_
   4                         / _ \\  /| '_ \ / _` | __|
   5                        |  __//  \| |_) | (_| | |_
   6                         \___/_/\_\ .__/ \__,_|\__|
   7                                  |_| XML parser
   8
   9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
  10    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
  11    Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
  12    Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
  13    Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
  14    Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
  15    Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
  16    Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
  17    Copyright (c) 2016      Don Lewis <truckman@apache.org>
  18    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
  19    Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
  20    Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
  21    Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
  22    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
  23    Copyright (c) 2021      Dong-hee Na <donghee.na@python.org>
  24    Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
  25    Licensed under the MIT license:
  26
  27    Permission is  hereby granted,  free of charge,  to any  person obtaining
  28    a  copy  of  this  software   and  associated  documentation  files  (the
  29    "Software"),  to  deal in  the  Software  without restriction,  including
  30    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
  31    distribute, sublicense, and/or sell copies of the Software, and to permit
  32    persons  to whom  the Software  is  furnished to  do so,  subject to  the
  33    following conditions:
  34
  35    The above copyright  notice and this permission notice  shall be included
  36    in all copies or substantial portions of the Software.
  37
  38    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
  39    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
  40    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
  41    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
  42    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
  43    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  44    USE OR OTHER DEALINGS IN THE SOFTWARE.
  45 */
  46
  47 #include <expat_config.h>
  48
  49 #include <stddef.h>
  50 #include <string.h> /* memcpy */
  51 #include <stdbool.h>
  52
  53 #ifdef _WIN32
  54 #  include "winconfig.h"
  55 #endif
  56
  57 #include "expat_external.h"
  58 #include "internal.h"
  59 #include "xmltok.h"
  60 #include "nametab.h"
  61
/* The ignore-section tokenizer entry is only part of the table when the
   library is built with XML_DTD; otherwise the slot expands to nothing. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Leading part of an ENCODING initializer: a braced group of tokenizer
   entries (prolog, content, CDATA section and, optionally, ignore section),
   a braced group of literal tokenizers (attribute value, entity value), and
   then the individual helper entries.  Every name is produced by PREFIX(),
   so PREFIX must be defined at the point where this macro is expanded. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the two converters that share the same PREFIX. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
  77
  78 #define UCS2_GET_NAMING(pages, hi, lo)                                         \
  79   (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))
  80
  81 /* A 2 byte UTF-8 representation splits the characters 11 bits between
  82    the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
  83    pages, 3 bits to add to that index and 5 bits to generate the mask.
  84 */
  85 #define UTF8_GET_NAMING2(pages, byte)                                          \
  86   (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
  87                 + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
  88    & (1u << (((byte)[1]) & 0x1F)))
  89
  90 /* A 3 byte UTF-8 representation splits the characters 16 bits between
  91    the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
  92    into pages, 3 bits to add to that index and 5 bits to generate the
  93    mask.
  94 */
  95 #define UTF8_GET_NAMING3(pages, byte)                                          \
  96   (namingBitmap                                                                \
  97        [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
  98          << 3)                                                                 \
  99         + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
 100    & (1u << (((byte)[2]) & 0x1F)))
 101
 102 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
 103    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
 104    with the additional restriction of not allowing the Unicode
 105    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
 106    Implementation details:
 107      (A & 0x80) == 0     means A < 0x80
 108    and
 109      (A & 0xC0) == 0xC0  means A > 0xBF
 110 */
 111
 112 #define UTF8_INVALID2(p)                                                       \
 113   ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
 114
 115 #define UTF8_INVALID3(p)                                                       \
 116   (((p)[2] & 0x80) == 0                                                        \
 117    || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                          \
 118                                       : ((p)[2] & 0xC0) == 0xC0)               \
 119    || ((*p) == 0xE0                                                            \
 120            ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
 121            : ((p)[1] & 0x80) == 0                                              \
 122                  || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
 123
 124 #define UTF8_INVALID4(p)                                                       \
 125   (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
 126    || ((p)[2] & 0xC0) == 0xC0                                                  \
 127    || ((*p) == 0xF0                                                            \
 128            ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
 129            : ((p)[1] & 0x80) == 0                                              \
 130                  || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
 131
 132 static int PTRFASTCALL
 133 isNever(const ENCODING *enc, const char *p) {
 134   UNUSED_P(enc);
 135   UNUSED_P(p);
 136   return 0;
 137 }
 138
 139 static int PTRFASTCALL
 140 utf8_isName2(const ENCODING *enc, const char *p) {
 141   UNUSED_P(enc);
 142   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
 143 }
 144
 145 static int PTRFASTCALL
 146 utf8_isName3(const ENCODING *enc, const char *p) {
 147   UNUSED_P(enc);
 148   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
 149 }
 150
 151 #define utf8_isName4 isNever
 152
 153 static int PTRFASTCALL
 154 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
 155   UNUSED_P(enc);
 156   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
 157 }
 158
 159 static int PTRFASTCALL
 160 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
 161   UNUSED_P(enc);
 162   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
 163 }
 164
 165 #define utf8_isNmstrt4 isNever
 166
 167 static int PTRFASTCALL
 168 utf8_isInvalid2(const ENCODING *enc, const char *p) {
 169   UNUSED_P(enc);
 170   return UTF8_INVALID2((const unsigned char *)p);
 171 }
 172
 173 static int PTRFASTCALL
 174 utf8_isInvalid3(const ENCODING *enc, const char *p) {
 175   UNUSED_P(enc);
 176   return UTF8_INVALID3((const unsigned char *)p);
 177 }
 178
 179 static int PTRFASTCALL
 180 utf8_isInvalid4(const ENCODING *enc, const char *p) {
 181   UNUSED_P(enc);
 182   return UTF8_INVALID4((const unsigned char *)p);
 183 }
 184
/* An ENCODING extended with a per-byte classification table and a set of
   character-class callbacks.  The ENCODING is the first member, which is
   what allows AS_NORMAL_ENCODING() to cast an ENCODING pointer to this
   struct. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type for each possible byte value; indexed by SB_BYTE_TYPE(). */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Extra callbacks that exist only in XML_MIN_SIZE builds.  There the
     BYTE_TYPE, IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII
     and CHAR_MATCHES macros dispatch through these pointers. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name-character, name-start and invalid-sequence checks for 2-, 3- and
     4-byte characters.  Filled by NORMAL_VTABLE() or, for encodings without
     multi-byte characters, set to NULL by NULL_VTABLE. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
 205
/* View an ENCODING pointer as the (const) normal_encoding that contains it. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* Initializers for the XML_MIN_SIZE-only members of struct normal_encoding.
   E is the function-name prefix; the expansion ends with a comma so that
   NORMAL_VTABLE or NULL_VTABLE can follow directly.  In other builds those
   members do not exist and the macro expands to nothing. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte callbacks, using prefix E. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine slots, all NULL, for encodings that supply no such callbacks. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL

/* Forward declaration; the definition is not in this part of the file. */
static int FASTCALL checkCharRefNumber(int);
 229
 230 #include "xmltok_impl.h"
 231 #include "ascii.h"
 232
/* In XML_MIN_SIZE builds the sb_ prefix needs isNameMin/isNmstrtMin entries
   for STANDARD_VTABLE(sb_); both are aliased to the always-false isNever. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

/* MINBPC is read from the encoding at run time in XML_MIN_SIZE builds and
   is the compile-time constant 1 otherwise. */
#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif
 244
/* Byte type of the single byte at p, taken from the encoding's type table.
   The byte is converted to unsigned char before it is used as an index. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

/* BYTE_TYPE goes through the byteType callback in XML_MIN_SIZE builds
   (sb_byteType is the single-byte implementation of that callback) and
   expands directly to the table lookup otherwise. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
 257
/* BYTE_TO_ASCII yields the byte at p itself for single-byte encodings.  In
   XML_MIN_SIZE builds it dispatches through the byteToAscii callback, and
   sb_byteToAscii is the single-byte implementation of that callback. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
 268
/* Dispatch to the n-byte (n = 2, 3 or 4) callbacks stored in the
   normal_encoding that contains enc. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
/* The XML_MIN_SIZE variant tolerates a NULL isInvalid callback (treated as
   "not invalid"); the other variant calls the pointer unconditionally. */
#ifdef XML_MIN_SIZE
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

/* Name checks for characters of exactly MINBPC bytes: a callback in
   XML_MIN_SIZE builds, the constant 0 otherwise. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
 289
/* CHAR_MATCHES compares the character at p with the ASCII character c.  In
   XML_MIN_SIZE builds it dispatches through the charMatches callback, and
   sb_charMatches is the single-byte implementation of that callback. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
 302
/* Instantiate the tokenizer template with the single-byte macro set defined
   above; every function it generates gets the normal_ prefix. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the single-byte helper macros so they can be defined again later.
   PREFIX is deliberately left defined. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
 317
/* Marker bits of the first byte of a UTF-8 sequence.  The converters in
   this file OR them into the lead byte they emit. */
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
       UTF8_cval1 = 0x00,
       UTF8_cval2 = 0xc0,
       UTF8_cval3 = 0xe0,
       UTF8_cval4 = 0xf0
};
 324
 325 void
 326 _INTERNAL_trim_to_complete_utf8_characters(const char *from,
 327                                            const char **fromLimRef) {
 328   const char *fromLim = *fromLimRef;
 329   size_t walked = 0;
 330   for (; fromLim > from; fromLim--, walked++) {
 331     const unsigned char prev = (unsigned char)fromLim[-1];
 332     if ((prev & 0xf8u)
 333         == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
 334       if (walked + 1 >= 4) {
 335         fromLim += 4 - 1;
 336         break;
 337       } else {
 338         walked = 0;
 339       }
 340     } else if ((prev & 0xf0u)
 341                == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
 342       if (walked + 1 >= 3) {
 343         fromLim += 3 - 1;
 344         break;
 345       } else {
 346         walked = 0;
 347       }
 348     } else if ((prev & 0xe0u)
 349                == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
 350       if (walked + 1 >= 2) {
 351         fromLim += 2 - 1;
 352         break;
 353       } else {
 354         walked = 0;
 355       }
 356     } else if ((prev & 0x80u)
 357                == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
 358       break;
 359     }
 360   }
 361   *fromLimRef = fromLim;
 362 }
 363
 364 static enum XML_Convert_Result PTRCALL
 365 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
 366             char **toP, const char *toLim) {
 367   bool input_incomplete = false;
 368   bool output_exhausted = false;
 369
 370   /* Avoid copying partial characters (due to limited space). */
 371   const ptrdiff_t bytesAvailable = fromLim - *fromP;
 372   const ptrdiff_t bytesStorable = toLim - *toP;
 373   UNUSED_P(enc);
 374   if (bytesAvailable > bytesStorable) {
 375     fromLim = *fromP + bytesStorable;
 376     output_exhausted = true;
 377   }
 378
 379   /* Avoid copying partial characters (from incomplete input). */
 380   {
 381     const char *const fromLimBefore = fromLim;
 382     _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
 383     if (fromLim < fromLimBefore) {
 384       input_incomplete = true;
 385     }
 386   }
 387
 388   {
 389     const ptrdiff_t bytesToCopy = fromLim - *fromP;
 390     memcpy(*toP, *fromP, bytesToCopy);
 391     *fromP += bytesToCopy;
 392     *toP += bytesToCopy;
 393   }
 394
 395   if (output_exhausted) /* needs to go first */
 396     return XML_CONVERT_OUTPUT_EXHAUSTED;
 397   else if (input_incomplete)
 398     return XML_CONVERT_INPUT_INCOMPLETE;
 399   else
 400     return XML_CONVERT_COMPLETED;
 401 }
 402
 403 static enum XML_Convert_Result PTRCALL
 404 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
 405              unsigned short **toP, const unsigned short *toLim) {
 406   enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
 407   unsigned short *to = *toP;
 408   const char *from = *fromP;
 409   while (from < fromLim && to < toLim) {
 410     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
 411     case BT_LEAD2:
 412       if (fromLim - from < 2) {
 413         res = XML_CONVERT_INPUT_INCOMPLETE;
 414         goto after;
 415       }
 416       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
 417       from += 2;
 418       break;
 419     case BT_LEAD3:
 420       if (fromLim - from < 3) {
 421         res = XML_CONVERT_INPUT_INCOMPLETE;
 422         goto after;
 423       }
 424       *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
 425                                | (from[2] & 0x3f));
 426       from += 3;
 427       break;
 428     case BT_LEAD4: {
 429       unsigned long n;
 430       if (toLim - to < 2) {
 431         res = XML_CONVERT_OUTPUT_EXHAUSTED;
 432         goto after;
 433       }
 434       if (fromLim - from < 4) {
 435         res = XML_CONVERT_INPUT_INCOMPLETE;
 436         goto after;
 437       }
 438       n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
 439           | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
 440       n -= 0x10000;
 441       to[0] = (unsigned short)((n >> 10) | 0xD800);
 442       to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
 443       to += 2;
 444       from += 4;
 445     } break;
 446     default:
 447       *to++ = *from++;
 448       break;
 449     }
 450   }
 451   if (from < fromLim)
 452     res = XML_CONVERT_OUTPUT_EXHAUSTED;
 453 after:
 454   *fromP = from;
 455   *toP = to;
 456   return res;
 457 }
 458
/* UTF-8 encoding tables.  Each one combines the normal_ tokenizer entries
   (VTABLE1) with the UTF-8 converters, a 256-entry byte-type table built
   from two included table fragments, and the UTF-8 multi-byte callbacks.
   NOTE(review): the three integers that close the ENCODING initializer are
   presumably minBytesPerChar, isUtf8 and isUtf16 - confirm against the
   ENCODING declaration in xmltok.h.

   The _ns variants (XML_NS builds only) include the ASCII table as is.  The
   other variants define BT_COLON as BT_NMSTRT while including it, so the
   colon entry gets the name-start type. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#ifdef XML_NS

/* The internal_ variants differ only in using iasciitab.h for the ASCII
   half of the table. */
static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
 500
 501 static enum XML_Convert_Result PTRCALL
 502 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
 503               char **toP, const char *toLim) {
 504   UNUSED_P(enc);
 505   for (;;) {
 506     unsigned char c;
 507     if (*fromP == fromLim)
 508       return XML_CONVERT_COMPLETED;
 509     c = (unsigned char)**fromP;
 510     if (c & 0x80) {
 511       if (toLim - *toP < 2)
 512         return XML_CONVERT_OUTPUT_EXHAUSTED;
 513       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
 514       *(*toP)++ = (char)((c & 0x3f) | 0x80);
 515       (*fromP)++;
 516     } else {
 517       if (*toP == toLim)
 518         return XML_CONVERT_OUTPUT_EXHAUSTED;
 519       *(*toP)++ = *(*fromP)++;
 520     }
 521   }
 522 }
 523
 524 static enum XML_Convert_Result PTRCALL
 525 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
 526                unsigned short **toP, const unsigned short *toLim) {
 527   UNUSED_P(enc);
 528   while (*fromP < fromLim && *toP < toLim)
 529     *(*toP)++ = (unsigned char)*(*fromP)++;
 530
 531   if ((*toP == toLim) && (*fromP < fromLim))
 532     return XML_CONVERT_OUTPUT_EXHAUSTED;
 533   else
 534     return XML_CONVERT_COMPLETED;
 535 }
 536
/* Latin-1 encoding tables: normal_ tokenizer entries, the Latin-1
   converters, a byte-type table built from the ASCII and Latin-1 fragments,
   and no multi-byte callbacks (NULL_VTABLE).  The _ns variant exists only in
   XML_NS builds.  The other variant defines BT_COLON as BT_NMSTRT while
   including the ASCII table, so the colon entry gets the name-start type. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
 558
 559 static enum XML_Convert_Result PTRCALL
 560 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
 561              char **toP, const char *toLim) {
 562   UNUSED_P(enc);
 563   while (*fromP < fromLim && *toP < toLim)
 564     *(*toP)++ = *(*fromP)++;
 565
 566   if ((*toP == toLim) && (*fromP < fromLim))
 567     return XML_CONVERT_OUTPUT_EXHAUSTED;
 568   else
 569     return XML_CONVERT_COMPLETED;
 570 }
 571
/* ASCII encoding tables: normal_ tokenizer entries, ascii_toUtf8 together
   with the Latin-1 UTF-16 converter, and a byte-type table whose explicit
   initializers come from the ASCII fragment only.  The remaining entries are
   zero-initialized, which the inline note equates with BT_NONXML.  The _ns
   variant exists only in XML_NS builds.  The other variant defines BT_COLON
   as BT_NMSTRT while including the ASCII table. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
 593
 594 static int PTRFASTCALL
 595 unicode_byte_type(char hi, char lo) {
 596   switch ((unsigned char)hi) {
 597   /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
 598   case 0xD8:
 599   case 0xD9:
 600   case 0xDA:
 601   case 0xDB:
 602     return BT_LEAD4;
 603   /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
 604   case 0xDC:
 605   case 0xDD:
 606   case 0xDE:
 607   case 0xDF:
 608     return BT_TRAIL;
 609   case 0xFF:
 610     switch ((unsigned char)lo) {
 611     case 0xFF: /* noncharacter-FFFF */
 612     case 0xFE: /* noncharacter-FFFE */
 613       return BT_NONXML;
 614     }
 615     break;
 616   }
 617   return BT_NONASCII;
 618 }
 619
/* Generates E##toUtf8, a converter from UTF-16 to UTF-8.  GET_HI and GET_LO
   must be defined at the point of expansion; they select the byte order.

   The input length is first rounded down to an even number of bytes.  Each
   code unit is then encoded according to its high byte:
     - 0x00 with a low byte below 0x80: one output byte;
     - 0x00-0x07 otherwise: two output bytes;
     - 0xD8-0xDB (first half of a surrogate pair): four output bytes, which
       also consumes the following code unit;
     - anything else: three output bytes.
   Before writing, the generated function checks the space left in the
   output.  If it is insufficient, it stores the position of the unconverted
   unit in *fromP and returns XML_CONVERT_OUTPUT_EXHAUSTED.  If the second
   half of a surrogate pair is missing from the input, it returns
   XML_CONVERT_INPUT_INCOMPLETE in the same way.

   NOTE(review): the loop only ends normally once `from < fromLim` is false,
   so the `from < fromLim` test after the loop looks unreachable as written
   and the normal exit returns XML_CONVERT_COMPLETED. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
 696
/* Generates E##toUtf16, which copies UTF-16 input into an array of unsigned
   short code units.  GET_HI and GET_LO must be defined at the point of
   expansion; they select the byte order.

   The input length is first rounded down to an even number of bytes.  If
   the input holds more code units than the output can take and the last
   input unit has a high byte in 0xD8-0xDF, that unit is left out and the
   provisional result becomes XML_CONVERT_INPUT_INCOMPLETE.  Units are then
   copied until either side runs out.  The generated function returns
   XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input remains,
   otherwise the provisional result (XML_CONVERT_COMPLETED by default). */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
 717
/* Little-endian UTF-16: the low byte of a code unit is stored first.
   SET2 stores a 16-bit value into two bytes; GET_LO / GET_HI read the low
   and high byte of the unit at ptr as unsigned values. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Defines little2_toUtf8 and little2_toUtf16. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian UTF-16: the high byte of a code unit is stored first. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Defines big2_toUtf8 and big2_toUtf16. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
 739
 740 #define LITTLE2_BYTE_TYPE(enc, p)                                              \
 741   ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]  \
 742                : unicode_byte_type((p)[1], (p)[0]))
 743 #define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
 744 #define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
 745 #define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
 746   UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
 747 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
 748   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
 749
#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: the little-endian accessors are provided as real
   functions.  Each one simply forwards to the matching LITTLE2_* macro. */

/* Byte type of the 2-byte unit at p (see LITTLE2_BYTE_TYPE). */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Low byte of the unit at p when its high byte is zero, otherwise -1. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

/* Non-zero when the unit at p has a zero high byte and low byte == c. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

/* Name-character test for the unit at p. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

/* Name-start-character test for the unit at p. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* Re-point VTABLE at the little2_ converters for the tables below. */
#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: configure the per-encoding macros for little-endian
   UTF-16 and include xmltok_impl.c with PREFIX producing little2_ names.
   NOTE(review): xmltok_impl.c is not visible here; presumably it defines
   the little2_-prefixed tokenizer functions in terms of these macros. */
#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
/* The n-byte name tests are constant 0 for this encoding; only the
   *_MINBPC variants perform a lookup. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Drop the per-encoding macros so the next instantiation starts clean.
   IS_INVALID_CHAR is not defined by this section; #undef of an undefined
   macro is harmless. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
 813
#ifdef XML_NS

/* Little-endian UTF-16 encoding table, built only when XML_NS is defined.
   Unlike little2_encoding it includes asciitab.h without remapping
   BT_COLON.  The last scalar initializer is 1 only when BYTEORDER is 1234.
   NOTE(review): the three scalars after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
 831
/* Little-endian UTF-16 encoding table.  While asciitab.h is included,
   BT_COLON is temporarily defined as BT_NMSTRT, so entries written as
   BT_COLON in that header become BT_NMSTRT here.  The last scalar
   initializer is 1 only when BYTEORDER is 1234.
   NOTE(review): the three scalars after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration. */
static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
 847
/* "Internal" little-endian UTF-16 tables, compiled unless BYTEORDER is
   4321.  They differ from the tables above in two visible ways: the last
   scalar initializer is always 1, and the ASCII part of the type table
   comes from iasciitab.h instead of asciitab.h. */
#if BYTEORDER != 4321

#  ifdef XML_NS

/* Variant built only with XML_NS; BT_COLON is not remapped. */
static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#  endif

/* Variant with BT_COLON defined as BT_NMSTRT while iasciitab.h is
   included. */
static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
 873
 874 #define BIG2_BYTE_TYPE(enc, p)                                                 \
 875   ((p)[0] == 0                                                                 \
 876        ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]        \
 877        : unicode_byte_type((p)[0], (p)[1]))
 878 #define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
 879 #define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
 880 #define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
 881   UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
 882 #define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
 883   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
 884
#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: the big-endian accessors are provided as real
   functions.  Each one simply forwards to the matching BIG2_* macro. */

/* Byte type of the 2-byte unit at p (see BIG2_BYTE_TYPE). */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}

/* Low byte of the unit at p when its high byte is zero, otherwise -1. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}

/* Non-zero when the unit at p has a zero high byte and low byte == c. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}

/* Name-character test for the unit at p. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}

/* Name-start-character test for the unit at p. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* Re-point VTABLE at the big2_ converters for the tables below. */
#  undef VTABLE
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: configure the per-encoding macros for big-endian UTF-16
   and include xmltok_impl.c with PREFIX producing big2_ names.
   NOTE(review): xmltok_impl.c is not visible here; presumably it defines
   the big2_-prefixed tokenizer functions in terms of these macros. */
#  undef PREFIX
#  define PREFIX(ident) big2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
/* The n-byte name tests are constant 0 for this encoding; only the
   *_MINBPC variants perform a lookup. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Drop the per-encoding macros again.  IS_INVALID_CHAR is not defined by
   this section; #undef of an undefined macro is harmless. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
 948
#ifdef XML_NS

/* Big-endian UTF-16 encoding table, built only when XML_NS is defined.
   Unlike big2_encoding it includes asciitab.h without remapping BT_COLON.
   The last scalar initializer is 1 only when BYTEORDER is 4321.
   NOTE(review): the three scalars after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
 966
/* Big-endian UTF-16 encoding table.  While asciitab.h is included,
   BT_COLON is temporarily defined as BT_NMSTRT, so entries written as
   BT_COLON in that header become BT_NMSTRT here.  The last scalar
   initializer is 1 only when BYTEORDER is 4321.
   NOTE(review): the three scalars after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration. */
static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
 982
/* "Internal" big-endian UTF-16 tables, compiled unless BYTEORDER is 1234.
   They differ from the tables above in two visible ways: the last scalar
   initializer is always 1, and the ASCII part of the type table comes
   from iasciitab.h instead of asciitab.h. */
#if BYTEORDER != 1234

#  ifdef XML_NS

/* Variant built only with XML_NS; BT_COLON is not remapped. */
static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#  endif

/* Variant with BT_COLON defined as BT_NMSTRT while iasciitab.h is
   included. */
static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
1008
1009 #undef PREFIX
1010
1011 static int FASTCALL
1012 streqci(const char *s1, const char *s2) {
1013   for (;;) {
1014     char c1 = *s1++;
1015     char c2 = *s2++;
1016     if (ASCII_a <= c1 && c1 <= ASCII_z)
1017       c1 += ASCII_A - ASCII_a;
1018     if (ASCII_a <= c2 && c2 <= ASCII_z)
1019       /* The following line will never get executed.  streqci() is
1020        * only called from two places, both of which guarantee to put
1021        * upper-case strings into s2.
1022        */
1023       c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1024     if (c1 != c2)
1025       return 0;
1026     if (! c1)
1027       break;
1028   }
1029   return 1;
1030 }
1031
1032 static void PTRCALL
1033 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1034                    POSITION *pos) {
1035   UNUSED_P(enc);
1036   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1037 }
1038
1039 static int
1040 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1041   char buf[1];
1042   char *p = buf;
1043   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1044   if (p == buf)
1045     return -1;
1046   else
1047     return buf[0];
1048 }
1049
1050 static int FASTCALL
1051 isSpace(int c) {
1052   switch (c) {
1053   case 0x20:
1054   case 0xD:
1055   case 0xA:
1056   case 0x9:
1057     return 1;
1058   }
1059   return 0;
1060 }
1061
/* Return 1 if there's just optional white space or there's an S
   followed by name=val.

   Scans [ptr, end) in encoding enc, stepping enc->minBytesPerChar bytes at
   a time and classifying characters through toAscii().

   Outputs:
   - Empty or whitespace-only input: *namePtr is set to NULL and 1 is
     returned; the other outputs are left untouched.
   - A pseudo-attribute was parsed: *namePtr / *nameEndPtr delimit the
     name, *valPtr points at the first character after the opening quote,
     *nextTokPtr points just past the closing quote, and 1 is returned.
   - Anything else: 0 is returned with *nextTokPtr at the offending
     position.
*/
static int
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
                     const char **namePtr, const char **nameEndPtr,
                     const char **valPtr, const char **nextTokPtr) {
  int c;
  char open;
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  /* A pseudo-attribute must be preceded by at least one space. */
  if (! isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Skip the leading run of white space. */
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  *namePtr = ptr;
  /* Scan the name up to '=' or white space followed by '='. */
  for (;;) {
    c = toAscii(enc, ptr, end);
    if (c == -1) {
      /* toAscii() produced no character here. */
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == ASCII_EQUALS) {
      *nameEndPtr = ptr;
      break;
    }
    if (isSpace(c)) {
      *nameEndPtr = ptr;
      do {
        ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      /* Only '=' may follow the white space after a name. */
      if (c != ASCII_EQUALS) {
        *nextTokPtr = ptr;
        return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* ptr is at '='; reject an empty name ('=' right at the name start). */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Step over '=' and any white space before the opening quote. */
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  if (c != ASCII_QUOT && c != ASCII_APOS) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Remember which quote opened the value; the same one must close it. */
  open = (char)c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  /* The value may contain only ASCII letters, digits, '.', '-' and '_'. */
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}
1141
/* NUL-terminated keyword strings, spelled out with the ASCII_* constants.
   doParseXmlDecl() compares pseudo-attribute names and values against
   them via XmlNameMatchesAscii(). */

/* "version" */
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

/* "encoding" */
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

/* "standalone" */
static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

/* "yes" */
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

/* "no" */
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1155
/* Parse the pseudo-attributes (version, encoding, standalone) of a
   declaration spanning [ptr, end) in encoding enc.

   encodingFinder is called with (enc, start, end) of the encoding name to
   obtain the ENCODING stored through *encoding.  isGeneralTextEntity
   selects the text-declaration rules: "version" may be omitted, an
   encoding declaration is required after a version, and "standalone" is
   rejected.

   versionPtr, versionEndPtr, encodingName, encoding and standalone are
   optional outputs and are only written when non-NULL.  badPtr is written
   unconditionally on failure.

   Returns 1 on success; returns 0 on failure with *badPtr pointing at the
   offending position.

   NOTE(review): the initial adjustment skips 5 characters at the start and
   2 at the end -- presumably the "<?xml" and "?>" delimiters; confirm
   against the callers.
*/
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
                                                 const char *),
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
               const char *end, const char **badPtr, const char **versionPtr,
               const char **versionEndPtr, const char **encodingName,
               const ENCODING **encoding, int *standalone) {
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is required. */
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || ! name) {
    *badPtr = ptr;
    return 0;
  }
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* Only a general text entity may start with something other than
       "version"; the attribute is then examined again below. */
    if (! isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  } else {
    if (versionPtr)
      *versionPtr = val;
    /* ptr is just past the closing quote of the version value. */
    if (versionEndPtr)
      *versionEndPtr = ptr;
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* The encoding name must begin with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr - minBytesPerChar is the closing quote, i.e. the end of the
       encoding name. */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name)
      return 1;
  }
  /* Whatever remains must be "standalone", which is not allowed in a
     general text entity. */
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  } else {
    *badPtr = val;
    return 0;
  }
  /* Only trailing white space may follow the standalone value. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
1236
1237 static int FASTCALL
1238 checkCharRefNumber(int result) {
1239   switch (result >> 8) {
1240   case 0xD8:
1241   case 0xD9:
1242   case 0xDA:
1243   case 0xDB:
1244   case 0xDC:
1245   case 0xDD:
1246   case 0xDE:
1247   case 0xDF:
1248     return -1;
1249   case 0:
1250     if (latin1_encoding.type[result] == BT_NONXML)
1251       return -1;
1252     break;
1253   case 0xFF:
1254     if (result == 0xFFFE || result == 0xFFFF)
1255       return -1;
1256     break;
1257   }
1258   return result;
1259 }
1260
1261 int FASTCALL
1262 XmlUtf8Encode(int c, char *buf) {
1263   enum {
1264     /* minN is minimum legal resulting value for N byte sequence */
1265     min2 = 0x80,
1266     min3 = 0x800,
1267     min4 = 0x10000
1268   };
1269
1270   if (c < 0)
1271     return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1272   if (c < min2) {
1273     buf[0] = (char)(c | UTF8_cval1);
1274     return 1;
1275   }
1276   if (c < min3) {
1277     buf[0] = (char)((c >> 6) | UTF8_cval2);
1278     buf[1] = (char)((c & 0x3f) | 0x80);
1279     return 2;
1280   }
1281   if (c < min4) {
1282     buf[0] = (char)((c >> 12) | UTF8_cval3);
1283     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1284     buf[2] = (char)((c & 0x3f) | 0x80);
1285     return 3;
1286   }
1287   if (c < 0x110000) {
1288     buf[0] = (char)((c >> 18) | UTF8_cval4);
1289     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1290     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1291     buf[3] = (char)((c & 0x3f) | 0x80);
1292     return 4;
1293   }
1294   return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1295 }
1296
1297 int FASTCALL
1298 XmlUtf16Encode(int charNum, unsigned short *buf) {
1299   if (charNum < 0)
1300     return 0;
1301   if (charNum < 0x10000) {
1302     buf[0] = (unsigned short)charNum;
1303     return 1;
1304   }
1305   if (charNum < 0x110000) {
1306     charNum -= 0x10000;
1307     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1308     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1309     return 2;
1310   }
1311   return 0;
1312 }
1313
/* Encoding object for an application-described ("unknown") encoding, as
   filled in by XmlInitUnknownEncoding(). */
struct unknown_encoding {
  /* Must be first: the object is also accessed as a struct normal_encoding
     and as an ENCODING. */
  struct normal_encoding normal;
  /* Callback used for bytes that start a multi-byte sequence. */
  CONVERTER convert;
  /* Opaque first argument passed to convert. */
  void *userData;
  /* Per-byte UTF-16 value; 0 marks bytes that need convert. */
  unsigned short utf16[256];
  /* Per-byte UTF-8 form: utf8[i][0] is the length, followed by the bytes;
     a length of 0 marks bytes that need convert. */
  char utf8[256][4];
};

/* View an ENCODING pointer as the enclosing struct unknown_encoding. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1323
1324 int
1325 XmlSizeOfUnknownEncoding(void) {
1326   return sizeof(struct unknown_encoding);
1327 }
1328
1329 static int PTRFASTCALL
1330 unknown_isName(const ENCODING *enc, const char *p) {
1331   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1332   int c = uenc->convert(uenc->userData, p);
1333   if (c & ~0xFFFF)
1334     return 0;
1335   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1336 }
1337
1338 static int PTRFASTCALL
1339 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1340   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1341   int c = uenc->convert(uenc->userData, p);
1342   if (c & ~0xFFFF)
1343     return 0;
1344   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1345 }
1346
1347 static int PTRFASTCALL
1348 unknown_isInvalid(const ENCODING *enc, const char *p) {
1349   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1350   int c = uenc->convert(uenc->userData, p);
1351   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1352 }
1353
1354 static enum XML_Convert_Result PTRCALL
1355 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1356                char **toP, const char *toLim) {
1357   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1358   char buf[XML_UTF8_ENCODE_MAX];
1359   for (;;) {
1360     const char *utf8;
1361     int n;
1362     if (*fromP == fromLim)
1363       return XML_CONVERT_COMPLETED;
1364     utf8 = uenc->utf8[(unsigned char)**fromP];
1365     n = *utf8++;
1366     if (n == 0) {
1367       int c = uenc->convert(uenc->userData, *fromP);
1368       n = XmlUtf8Encode(c, buf);
1369       if (n > toLim - *toP)
1370         return XML_CONVERT_OUTPUT_EXHAUSTED;
1371       utf8 = buf;
1372       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1373                  - (BT_LEAD2 - 2));
1374     } else {
1375       if (n > toLim - *toP)
1376         return XML_CONVERT_OUTPUT_EXHAUSTED;
1377       (*fromP)++;
1378     }
1379     memcpy(*toP, utf8, n);
1380     *toP += n;
1381   }
1382 }
1383
1384 static enum XML_Convert_Result PTRCALL
1385 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1386                 unsigned short **toP, const unsigned short *toLim) {
1387   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1388   while (*fromP < fromLim && *toP < toLim) {
1389     unsigned short c = uenc->utf16[(unsigned char)**fromP];
1390     if (c == 0) {
1391       c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1392       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1393                  - (BT_LEAD2 - 2));
1394     } else
1395       (*fromP)++;
1396     *(*toP)++ = c;
1397   }
1398
1399   if ((*toP == toLim) && (*fromP < fromLim))
1400     return XML_CONVERT_OUTPUT_EXHAUSTED;
1401   else
1402     return XML_CONVERT_COMPLETED;
1403 }
1404
/* Build an encoding object in mem from a 256-entry byte map.

   mem is used as a struct unknown_encoding (see XmlSizeOfUnknownEncoding()
   for its size) and starts out as a copy of latin1_encoding.  table[i]
   describes input byte i:
     -1         the byte is malformed;
     -2 .. -4   the byte starts a sequence of 2 .. 4 bytes, which requires
                a non-NULL convert callback;
     >= 0       the character number the single byte maps to.
   convert/userData are stored for later use by the converters.

   Returns the ENCODING embedded in mem, or 0 when the table is rejected:
   a byte below 128 whose latin1 type is neither BT_OTHER nor BT_NONXML
   does not map to itself, an entry is below -4, a multi-byte entry is
   present without convert, an entry maps onto such a significant ASCII
   character from a different byte, or a valid entry is above 0xFFFF.
*/
ENCODING *
XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
                       void *userData) {
  int i;
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
  /* Bytes that are significant in ASCII must keep their meaning. */
  for (i = 0; i < 128; i++)
    if (latin1_encoding.type[i] != BT_OTHER
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
      return 0;
  for (i = 0; i < 256; i++) {
    int c = table[i];
    if (c == -1) {
      e->normal.type[i] = BT_MALFORM;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    } else if (c < 0) {
      if (c < -4)
        return 0;
      /* Multi-byte sequences need a converter function */
      if (! convert)
        return 0;
      /* -2 maps to BT_LEAD2, -3 and -4 to the two following type values;
         zero entries in utf8/utf16 tell the converters to call convert. */
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
      e->utf8[i][0] = 0;
      e->utf16[i] = 0;
    } else if (c < 0x80) {
      /* A significant ASCII character may only be produced by its own
         byte value. */
      if (latin1_encoding.type[c] != BT_OTHER
          && latin1_encoding.type[c] != BT_NONXML && c != i)
        return 0;
      e->normal.type[i] = latin1_encoding.type[c];
      e->utf8[i][0] = 1;
      e->utf8[i][1] = (char)c;
      /* 0 is reserved in utf16[] as the "call convert" marker, so a byte
         mapping to character 0 is stored as 0xFFFF instead. */
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
    } else if (checkCharRefNumber(c) < 0) {
      e->normal.type[i] = BT_NONXML;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    } else {
      if (c > 0xFFFF)
        return 0;
      /* Classify the character by the naming bitmaps. */
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NMSTRT;
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NAME;
      else
        e->normal.type[i] = BT_OTHER;
      /* Store the length in slot 0 and the encoded bytes after it. */
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
      e->utf16[i] = (unsigned short)c;
    }
  }
  e->userData = userData;
  e->convert = convert;
  /* With a converter, every multi-byte length shares the same
     convert-based classification functions. */
  if (convert) {
    e->normal.isName2 = unknown_isName;
    e->normal.isName3 = unknown_isName;
    e->normal.isName4 = unknown_isName;
    e->normal.isNmstrt2 = unknown_isNmstrt;
    e->normal.isNmstrt3 = unknown_isNmstrt;
    e->normal.isNmstrt4 = unknown_isNmstrt;
    e->normal.isInvalid2 = unknown_isInvalid;
    e->normal.isInvalid3 = unknown_isInvalid;
    e->normal.isInvalid4 = unknown_isInvalid;
  }
  e->normal.enc.utf8Convert = unknown_toUtf8;
  e->normal.enc.utf16Convert = unknown_toUtf16;
  return &(e->normal.enc);
}
1476
/* Indices of the built-in encodings.  getEncodingIndex() returns one of
   these values.

   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6
};
1490
/* NUL-terminated, upper-case names of the built-in encodings, spelled out
   with the ASCII_* constants.  getEncodingIndex() matches against them. */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[]
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
/* "US-ASCII" */
static const char KW_US_ASCII[]
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
/* "UTF-8" */
static const char KW_UTF_8[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
/* "UTF-16" */
static const char KW_UTF_16[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
/* "UTF-16BE" */
static const char KW_UTF_16BE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_B, ASCII_E, '\0'};
/* "UTF-16LE" */
static const char KW_UTF_16LE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1507
1508 static int FASTCALL
1509 getEncodingIndex(const char *name) {
1510   static const char *const encodingNames[] = {
1511       KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1512   };
1513   int i;
1514   if (name == NULL)
1515     return NO_ENC;
1516   for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1517     if (streqci(name, encodingNames[i]))
1518       return i;
1519   return UNKNOWN_ENC;
1520 }
1521
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int.  SET_INIT_ENC_INDEX
   stores index i, narrowed to char; i is parenthesized so that the cast
   applies to the whole argument expression rather than to its first
   operand only.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1528
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   Returns XML_TOK_NONE when there is no input, XML_TOK_PARTIAL when more
   bytes are needed to decide, and XML_TOK_BOM when a byte order mark was
   recognized (*nextTokPtr is then set just past it).  In every other case
   the chosen encoding is stored through enc->encPtr and the result of
   XmlTok() on the same input is returned.
*/

static int
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
  const ENCODING **encPtr;

  if (ptr >= end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  } else {
    /* Inspect the first two bytes, combined as (first << 8) | second. */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* FE FF: handled as a big-endian UTF-16 byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* 3C 00: a '<' in little-endian UTF-16 */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* FF FE: handled as a little-endian UTF-16 byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
            || e == UTF_16_ENC)
          break;
      }
      /* The third byte is needed to confirm the BOM. */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      } else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* Nothing was auto-detected: use the encoding chosen at initialization. */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1647
/* First inclusion of xmltok_ns.c: NS() and ns() leave identifiers
   unchanged, so the included code is emitted under its plain names.
   NOTE(review): xmltok_ns.c is not visible here; presumably it supplies
   the encoding tables and initialization entry points built on initScan
   and the encodings above. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
1655
1656 #ifdef XML_NS
1657
/* Second inclusion of xmltok_ns.c, for XML_NS builds: NS() appends "NS"
   and ns() appends "_ns" to each identifier, so the same code is emitted
   again under namespace-specific names. */
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns
1667
1668 ENCODING *
1669 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1670                          void *userData) {
1671   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1672   if (enc)
1673     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1674   return enc;
1675 }
1676
1677 #endif /* XML_NS */