4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12 Copyright (c) 2002 Greg Stein <gstein@users.sourceforge.net>
13 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14 Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15 Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
16 Copyright (c) 2016 Pascal Cuoq <cuoq@trust-in-soft.com>
17 Copyright (c) 2016 Don Lewis <truckman@apache.org>
18 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
19 Copyright (c) 2017 Alexander Bluhm <alexander.bluhm@gmx.net>
20 Copyright (c) 2017 Benbuck Nason <bnason@netflix.com>
21 Copyright (c) 2017 José Gutiérrez de la Concha <jose@zeroc.com>
22 Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
23 Copyright (c) 2021 Dong-hee Na <donghee.na@python.org>
24 Copyright (c) 2022 Martin Ettl <ettl.martin78@googlemail.com>
25 Licensed under the MIT license:
27 Permission is hereby granted, free of charge, to any person obtaining
28 a copy of this software and associated documentation files (the
29 "Software"), to deal in the Software without restriction, including
30 without limitation the rights to use, copy, modify, merge, publish,
31 distribute, sublicense, and/or sell copies of the Software, and to permit
32 persons to whom the Software is furnished to do so, subject to the
35 The above copyright notice and this permission notice shall be included
36 in all copies or substantial portions of the Software.
38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
39 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
40 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
41 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
42 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
43 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
44 USE OR OTHER DEALINGS IN THE SOFTWARE.
47 #include <expat_config.h>
50 #include <string.h> /* memcpy */
54 # include "winconfig.h"
57 #include "expat_external.h"
63 # define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
65 # define IGNORE_SECTION_TOK_VTABLE /* as nothing */
69 {PREFIX(prologTok), PREFIX(contentTok), \
70 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \
71 {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \
72 PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \
73 PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \
74 PREFIX(updatePosition), PREFIX(isPublicId)
76 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
/* Non-zero when the bit for the character with high byte hi and low byte lo
   is set in namingBitmap.  pages[hi] is scaled by 8 to select a group of
   bitmap entries, lo >> 5 selects the entry inside that group, and the low
   five bits of lo select the bit inside the entry.  Note that pages is
   expanded without surrounding parentheses, so pass a plain array name. */
78 #define UCS2_GET_NAMING(pages, hi, lo) \
79 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))
81 /* A 2 byte UTF-8 representation splits the character's 11 bits between
82 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
83 pages, 3 bits to add to that index and 5 bits to generate the mask.
/* Naming-bitmap test for a 2-byte UTF-8 sequence.  Bits 4..2 of byte[0]
   index pages; bits 1..0 of byte[0] together with bit 5 of byte[1] select the
   bitmap entry; the low five bits of byte[1] select the bit.  Callers in this
   file pass byte as a const unsigned char pointer. */
85 #define UTF8_GET_NAMING2(pages, byte) \
86 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
87 + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)] \
88 & (1u << (((byte)[1]) & 0x1F)))
90 /* A 3 byte UTF-8 representation splits the character's 16 bits between
91 the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
92 into pages, 3 bits to add to that index and 5 bits to generate the
95 #define UTF8_GET_NAMING3(pages, byte) \
97 [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)] \
99 + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \
100 & (1u << (((byte)[2]) & 0x1F)))
102 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
103 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
104 with the additional restriction of not allowing the Unicode
105 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
106 Implementation details:
107 (A & 0x80) == 0 means A < 0x80
109 (A & 0xC0) == 0xC0 means A > 0xBF
/* Non-zero when the two bytes at p cannot form a valid 2-byte UTF-8 sequence:
   the lead byte is below 0xC2, or the second byte is outside 0x80..0xBF
   (top bit clear, or top two bits both set).  p is dereferenced as given, so
   it must already point at unsigned bytes. */
112 #define UTF8_INVALID2(p) \
113 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
115 #define UTF8_INVALID3(p) \
116 (((p)[2] & 0x80) == 0 \
117 || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD \
118 : ((p)[2] & 0xC0) == 0xC0) \
120 ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
121 : ((p)[1] & 0x80) == 0 \
122 || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
124 #define UTF8_INVALID4(p) \
125 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0 \
126 || ((p)[2] & 0xC0) == 0xC0 \
128 ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
129 : ((p)[1] & 0x80) == 0 \
130 || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
132 static int PTRFASTCALL
133 isNever(const ENCODING *enc, const char *p) {
/* Name-character test for a 2-byte UTF-8 sequence at p: looks the sequence up
   in namePages through UTF8_GET_NAMING2. */
139 static int PTRFASTCALL
140 utf8_isName2(const ENCODING *enc, const char *p) {
142 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
/* Name-character test for a 3-byte UTF-8 sequence at p: looks the sequence up
   in namePages through UTF8_GET_NAMING3. */
145 static int PTRFASTCALL
146 utf8_isName3(const ENCODING *enc, const char *p) {
148 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
151 #define utf8_isName4 isNever
/* Name-start-character test for a 2-byte UTF-8 sequence at p: looks the
   sequence up in nmstrtPages through UTF8_GET_NAMING2. */
153 static int PTRFASTCALL
154 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
156 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
/* Name-start-character test for a 3-byte UTF-8 sequence at p: looks the
   sequence up in nmstrtPages through UTF8_GET_NAMING3. */
159 static int PTRFASTCALL
160 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
162 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
165 #define utf8_isNmstrt4 isNever
/* Function form of UTF8_INVALID2: non-zero when the 2-byte sequence at p is
   malformed.  The cast makes the macro compare unsigned byte values. */
167 static int PTRFASTCALL
168 utf8_isInvalid2(const ENCODING *enc, const char *p) {
170 return UTF8_INVALID2((const unsigned char *)p);
/* Function form of UTF8_INVALID3: non-zero when the 3-byte sequence at p is
   malformed.  The cast makes the macro compare unsigned byte values. */
173 static int PTRFASTCALL
174 utf8_isInvalid3(const ENCODING *enc, const char *p) {
176 return UTF8_INVALID3((const unsigned char *)p);
/* Function form of UTF8_INVALID4: non-zero when the 4-byte sequence at p is
   malformed.  The cast makes the macro compare unsigned byte values. */
179 static int PTRFASTCALL
180 utf8_isInvalid4(const ENCODING *enc, const char *p) {
182 return UTF8_INVALID4((const unsigned char *)p);
/* Encoding object used throughout this file: a per-byte classification table
   plus the function pointers that the BYTE_TYPE / IS_*_CHAR macro families
   dispatch through (see AS_NORMAL_ENCODING). */
185 struct normal_encoding {
/* Byte classification, indexed by the unsigned byte value (SB_BYTE_TYPE). */
187 unsigned char type[256];
/* Slots supplied by STANDARD_VTABLE(); reached through BYTE_TYPE,
   IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII and
   CHAR_MATCHES respectively. */
189 int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
190 int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
191 int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
192 int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
193 int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
194 #endif /* XML_MIN_SIZE */
/* Predicates selected by the n argument of IS_NAME_CHAR, IS_NMSTRT_CHAR and
   IS_INVALID_CHAR (n = 2, 3 or 4).  Filled by NORMAL_VTABLE() or set to NULL
   by NULL_VTABLE. */
195 int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
196 int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
197 int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
198 int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
199 int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
200 int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
201 int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
202 int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
203 int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
206 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
210 # define STANDARD_VTABLE(E) \
211 E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
215 # define STANDARD_VTABLE(E) /* as nothing */
/* Initializer list for the nine isName / isNmstrt / isInvalid slots of
   struct normal_encoding, built by pasting the prefix E onto each slot name
   (for example NORMAL_VTABLE(utf8_) yields utf8_isName2, ...).  The order
   must match the member order in the struct. */
219 #define NORMAL_VTABLE(E) \
220 E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \
221 E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
/* Counterpart of NORMAL_VTABLE() that leaves all nine predicate slots NULL.
   The encodings in this file that use it are the Latin-1, ASCII and
   two-byte ones. */
223 #define NULL_VTABLE \
224 /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \
225 /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \
226 /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
228 static int FASTCALL checkCharRefNumber(int);
230 #include "xmltok_impl.h"
234 # define sb_isNameMin isNever
235 # define sb_isNmstrtMin isNever
239 # define MINBPC(enc) ((enc)->minBytesPerChar)
241 /* minimum bytes per character */
242 # define MINBPC(enc) 1
/* Single-byte classification: reads the byte at p as unsigned and returns its
   entry in the type table of enc (treated as a struct normal_encoding). */
245 #define SB_BYTE_TYPE(enc, p) \
246 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
/* Function form of SB_BYTE_TYPE, so it can be stored in the byteType slot
   that STANDARD_VTABLE(sb_) names. */
249 static int PTRFASTCALL
250 sb_byteType(const ENCODING *enc, const char *p) {
251 return SB_BYTE_TYPE(enc, p);
253 # define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
255 # define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
259 # define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
260 static int PTRFASTCALL
261 sb_byteToAscii(const ENCODING *enc, const char *p) {
266 # define BYTE_TO_ASCII(enc, p) (*(p))
269 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
270 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
/* Guarded variant: the isInvalid<n> slot is tested before it is called, so an
   encoding whose slot is NULL (see NULL_VTABLE) reports the character as not
   invalid instead of calling through a null pointer. */
272 # define IS_INVALID_CHAR(enc, p, n) \
273 (AS_NORMAL_ENCODING(enc)->isInvalid##n \
274 && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
/* Unguarded variant: calls the isInvalid<n> slot directly, so the slot must
   be non-NULL for every encoding that reaches this macro. */
276 # define IS_INVALID_CHAR(enc, p, n) \
277 (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
281 # define IS_NAME_CHAR_MINBPC(enc, p) \
282 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
283 # define IS_NMSTRT_CHAR_MINBPC(enc, p) \
284 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
286 # define IS_NAME_CHAR_MINBPC(enc, p) (0)
287 # define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
291 # define CHAR_MATCHES(enc, p, c) \
292 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
294 sb_charMatches(const ENCODING *enc, const char *p, int c) {
299 /* c is an ASCII character */
300 # define CHAR_MATCHES(enc, p, c) (*(p) == (c))
303 #define PREFIX(ident) normal_##ident
304 #define XML_TOK_IMPL_C
305 #include "xmltok_impl.c"
306 #undef XML_TOK_IMPL_C
313 #undef IS_NAME_CHAR_MINBPC
314 #undef IS_NMSTRT_CHAR
315 #undef IS_NMSTRT_CHAR_MINBPC
316 #undef IS_INVALID_CHAR
318 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
/* Walks backwards from *fromLimRef towards from, one byte at a time.  Each
   byte is classified by its leading bit pattern (4-, 3-, 2- or 1-byte lead)
   and the number of bytes walked so far is compared with the length that
   lead announces.  The resulting limit, which is never above the incoming
   one, is written back through fromLimRef. */
326 _INTERNAL_trim_to_complete_utf8_characters(const char *from,
327 const char **fromLimRef) {
328 const char *fromLim = *fromLimRef;
330 for (; fromLim > from; fromLim--, walked++) {
331 const unsigned char prev = (unsigned char)fromLim[-1];
333 == 0xf0u) { /* 4-byte character, led by 0b11110xxx byte */
/* walked + 1 counts this lead byte plus the bytes already stepped over. */
334 if (walked + 1 >= 4) {
340 } else if ((prev & 0xf0u)
341 == 0xe0u) { /* 3-byte character, led by 0b1110xxxx byte */
342 if (walked + 1 >= 3) {
348 } else if ((prev & 0xe0u)
349 == 0xc0u) { /* 2-byte character, led by 0b110xxxxx byte */
350 if (walked + 1 >= 2) {
356 } else if ((prev & 0x80u)
357 == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
361 *fromLimRef = fromLim;
/* UTF-8 to UTF-8 copy that never splits a character.  The source window is
   first clamped to the free space in the destination, then trimmed back to a
   character boundary, and only that many bytes are copied.  Returns
   XML_CONVERT_OUTPUT_EXHAUSTED when the destination was the limiting factor,
   XML_CONVERT_INPUT_INCOMPLETE when only the trim shortened the window, and
   XML_CONVERT_COMPLETED otherwise.  *fromP is advanced past the copied
   bytes. */
364 static enum XML_Convert_Result PTRCALL
365 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
366 char **toP, const char *toLim) {
367 bool input_incomplete = false;
368 bool output_exhausted = false;
370 /* Avoid copying partial characters (due to limited space). */
371 const ptrdiff_t bytesAvailable = fromLim - *fromP;
372 const ptrdiff_t bytesStorable = toLim - *toP;
374 if (bytesAvailable > bytesStorable) {
/* Clamp the source window to what the destination can hold. */
375 fromLim = *fromP + bytesStorable;
376 output_exhausted = true;
379 /* Avoid copying partial characters (from incomplete input). */
381 const char *const fromLimBefore = fromLim;
382 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
/* A lowered limit means the window ended inside a character. */
383 if (fromLim < fromLimBefore) {
384 input_incomplete = true;
389 const ptrdiff_t bytesToCopy = fromLim - *fromP;
390 memcpy(*toP, *fromP, bytesToCopy);
391 *fromP += bytesToCopy;
/* Both flags can be set when the clamp itself cut a character; in that case
   the caller needs more output room, so that result is reported first. */
395 if (output_exhausted) /* needs to go first */
396 return XML_CONVERT_OUTPUT_EXHAUSTED;
397 else if (input_incomplete)
398 return XML_CONVERT_INPUT_INCOMPLETE;
400 return XML_CONVERT_COMPLETED;
/* Converts UTF-8 in [*fromP, fromLim) to 16-bit units in [*toP, toLim).  The
   byte-type table of enc decides how many input bytes each character takes.
   res records XML_CONVERT_INPUT_INCOMPLETE when a sequence is cut short by
   fromLim and XML_CONVERT_OUTPUT_EXHAUSTED when the output runs out. */
403 static enum XML_Convert_Result PTRCALL
404 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
405 unsigned short **toP, const unsigned short *toLim) {
406 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
407 unsigned short *to = *toP;
408 const char *from = *fromP;
409 while (from < fromLim && to < toLim) {
410 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
/* Two-byte sequence: 5 payload bits from from[0], 6 from from[1]. */
412 if (fromLim - from < 2) {
413 res = XML_CONVERT_INPUT_INCOMPLETE;
416 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
/* Three-byte sequence: 4 payload bits from from[0], then 6 bits per byte. */
420 if (fromLim - from < 3) {
421 res = XML_CONVERT_INPUT_INCOMPLETE;
424 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
/* Four-byte sequence: written as two 16-bit units, so two output slots are
   required before any input is consumed. */
430 if (toLim - to < 2) {
431 res = XML_CONVERT_OUTPUT_EXHAUSTED;
434 if (fromLim - from < 4) {
435 res = XML_CONVERT_INPUT_INCOMPLETE;
438 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
439 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
/* High ten bits go under 0xD800, low ten bits under 0xDC00. */
441 to[0] = (unsigned short)((n >> 10) | 0xD800);
442 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
452 res = XML_CONVERT_OUTPUT_EXHAUSTED;
460 static const struct normal_encoding utf8_encoding_ns
461 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
463 # include "asciitab.h"
464 # include "utf8tab.h"
466 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
469 static const struct normal_encoding utf8_encoding
470 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
472 #define BT_COLON BT_NMSTRT
473 #include "asciitab.h"
477 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
481 static const struct normal_encoding internal_utf8_encoding_ns
482 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
484 # include "iasciitab.h"
485 # include "utf8tab.h"
487 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
491 static const struct normal_encoding internal_utf8_encoding
492 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
494 #define BT_COLON BT_NMSTRT
495 #include "iasciitab.h"
499 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
/* Converts Latin-1 bytes in [*fromP, fromLim) to UTF-8 in [*toP, toLim).
   Returns XML_CONVERT_COMPLETED once the input is used up and
   XML_CONVERT_OUTPUT_EXHAUSTED when the next character does not fit. */
501 static enum XML_Convert_Result PTRCALL
502 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
503 char **toP, const char *toLim) {
507 if (*fromP == fromLim)
508 return XML_CONVERT_COMPLETED;
509 c = (unsigned char)**fromP;
/* Two-byte form: both output bytes must fit before anything is written. */
511 if (toLim - *toP < 2)
512 return XML_CONVERT_OUTPUT_EXHAUSTED;
513 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
514 *(*toP)++ = (char)((c & 0x3f) | 0x80);
518 return XML_CONVERT_OUTPUT_EXHAUSTED;
/* One-byte form: the input byte is copied through unchanged. */
519 *(*toP)++ = *(*fromP)++;
/* Widens each input byte (read as unsigned) to one 16-bit unit until either
   the input or the output runs out.  Returns XML_CONVERT_OUTPUT_EXHAUSTED
   only when the output is full while input remains, otherwise
   XML_CONVERT_COMPLETED.  Also used as the UTF-16 converter of the ASCII
   encodings in this file. */
524 static enum XML_Convert_Result PTRCALL
525 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
526 unsigned short **toP, const unsigned short *toLim) {
528 while (*fromP < fromLim && *toP < toLim)
529 *(*toP)++ = (unsigned char)*(*fromP)++;
531 if ((*toP == toLim) && (*fromP < fromLim))
532 return XML_CONVERT_OUTPUT_EXHAUSTED;
534 return XML_CONVERT_COMPLETED;
539 static const struct normal_encoding latin1_encoding_ns
540 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
542 # include "asciitab.h"
543 # include "latin1tab.h"
545 STANDARD_VTABLE(sb_) NULL_VTABLE};
549 static const struct normal_encoding latin1_encoding
550 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
552 #define BT_COLON BT_NMSTRT
553 #include "asciitab.h"
555 #include "latin1tab.h"
557 STANDARD_VTABLE(sb_) NULL_VTABLE};
/* Byte-for-byte copy from [*fromP, fromLim) to [*toP, toLim); no bytes are
   rewritten.  Returns XML_CONVERT_OUTPUT_EXHAUSTED only when the output is
   full while input remains, otherwise XML_CONVERT_COMPLETED. */
559 static enum XML_Convert_Result PTRCALL
560 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
561 char **toP, const char *toLim) {
563 while (*fromP < fromLim && *toP < toLim)
564 *(*toP)++ = *(*fromP)++;
566 if ((*toP == toLim) && (*fromP < fromLim))
567 return XML_CONVERT_OUTPUT_EXHAUSTED;
569 return XML_CONVERT_COMPLETED;
574 static const struct normal_encoding ascii_encoding_ns
575 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
577 # include "asciitab.h"
580 STANDARD_VTABLE(sb_) NULL_VTABLE};
584 static const struct normal_encoding ascii_encoding
585 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
587 #define BT_COLON BT_NMSTRT
588 #include "asciitab.h"
592 STANDARD_VTABLE(sb_) NULL_VTABLE};
/* Classifies a 16-bit code unit given as its high and low bytes.  Used by
   LITTLE2_BYTE_TYPE and BIG2_BYTE_TYPE as the alternative to the per-byte
   table lookup.  The outer switch distinguishes the surrogate ranges by the
   high byte; the inner switch singles out low bytes 0xFF and 0xFE. */
594 static int PTRFASTCALL
595 unicode_byte_type(char hi, char lo) {
596 switch ((unsigned char)hi) {
597 /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
603 /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
610 switch ((unsigned char)lo) {
611 case 0xFF: /* noncharacter-FFFF */
612 case 0xFE: /* noncharacter-FFFE */
/* Generates E##toUtf8, a converter from 16-bit code units to UTF-8.  Byte
   order comes from whichever GET_LO / GET_HI definitions are active where the
   macro is expanded.  The input length is first rounded down to an even
   number of bytes.  Before each 1-, 2-, 3- or 4-byte write the remaining
   output space is checked and XML_CONVERT_OUTPUT_EXHAUSTED returned if it is
   too small; the 4-byte form also needs 4 input bytes and otherwise returns
   XML_CONVERT_INPUT_INCOMPLETE. */
620 #define DEFINE_UTF16_TO_UTF8(E) \
621 static enum XML_Convert_Result PTRCALL E##toUtf8( \
622 const ENCODING *enc, const char **fromP, const char *fromLim, \
623 char **toP, const char *toLim) { \
624 const char *from = *fromP; \
626 fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \
627 for (; from < fromLim; from += 2) { \
630 unsigned char lo = GET_LO(from); \
631 unsigned char hi = GET_HI(from); \
635 if (*toP == toLim) { \
637 return XML_CONVERT_OUTPUT_EXHAUSTED; \
650 if (toLim - *toP < 2) { \
652 return XML_CONVERT_OUTPUT_EXHAUSTED; \
654 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
655 *(*toP)++ = ((lo & 0x3f) | 0x80); \
658 if (toLim - *toP < 3) { \
660 return XML_CONVERT_OUTPUT_EXHAUSTED; \
662 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
663 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
664 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
665 *(*toP)++ = ((lo & 0x3f) | 0x80); \
671 if (toLim - *toP < 4) { \
673 return XML_CONVERT_OUTPUT_EXHAUSTED; \
675 if (fromLim - from < 4) { \
677 return XML_CONVERT_INPUT_INCOMPLETE; \
679 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
680 *(*toP)++ = (char)((plane >> 2) | UTF8_cval4); \
681 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
683 lo2 = GET_LO(from); \
684 *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2) \
685 | (lo2 >> 6) | 0x80); \
686 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
691 if (from < fromLim) \
692 return XML_CONVERT_INPUT_INCOMPLETE; \
694 return XML_CONVERT_COMPLETED; \
/* Generates E##toUtf16, which copies 16-bit code units into native unsigned
   short values using the active GET_HI / GET_LO byte order.  The input length
   is rounded down to an even number of bytes.  When the input is longer than
   the output can hold and the last input unit has a high byte matching
   0xD8..0xDF under the 0xF8 mask, res is set to XML_CONVERT_INPUT_INCOMPLETE.
   A full output with input remaining returns XML_CONVERT_OUTPUT_EXHAUSTED. */
697 #define DEFINE_UTF16_TO_UTF16(E) \
698 static enum XML_Convert_Result PTRCALL E##toUtf16( \
699 const ENCODING *enc, const char **fromP, const char *fromLim, \
700 unsigned short **toP, const unsigned short *toLim) { \
701 enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
703 fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \
704 /* Avoid copying first half only of surrogate */ \
705 if (fromLim - *fromP > ((toLim - *toP) << 1) \
706 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
708 res = XML_CONVERT_INPUT_INCOMPLETE; \
710 for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
711 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
712 if ((*toP == toLim) && (*fromP < fromLim)) \
713 return XML_CONVERT_OUTPUT_EXHAUSTED; \
/* Little-endian byte accessors for the little2_ converters generated just
   below: byte 0 is the low byte, byte 1 the high byte.  SET2 stores ch in
   that order; GET_LO / GET_HI read the bytes back as unsigned values. */
718 #define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
719 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
720 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
722 DEFINE_UTF16_TO_UTF8(little2_)
723 DEFINE_UTF16_TO_UTF16(little2_)
/* Big-endian byte accessors for the big2_ converters generated just below:
   byte 0 is the high byte, byte 1 the low byte.  SET2 stores ch in that
   order; GET_LO / GET_HI read the bytes back as unsigned values. */
729 #define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
730 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
731 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
733 DEFINE_UTF16_TO_UTF8(big2_)
734 DEFINE_UTF16_TO_UTF16(big2_)
/* Little-endian two-byte helpers: p[0] is the low byte, p[1] the high byte. */
/* When the high byte is zero the low byte is classified through the type
   table of enc; otherwise unicode_byte_type decides from (high, low). */
740 #define LITTLE2_BYTE_TYPE(enc, p) \
741 ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
742 : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
743 #define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the high byte is zero and the low byte equals c. */
744 #define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name / name-start lookups; p is used without parentheses, so pass a plain
   pointer expression. */
745 #define LITTLE2_IS_NAME_CHAR_MINBPC(p) \
746 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
747 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) \
748 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
/* Function form of LITTLE2_BYTE_TYPE for the byteType slot named by
   STANDARD_VTABLE(little2_). */
752 static int PTRFASTCALL
753 little2_byteType(const ENCODING *enc, const char *p) {
754 return LITTLE2_BYTE_TYPE(enc, p);
/* Function form of LITTLE2_BYTE_TO_ASCII for the byteToAscii slot: the low
   byte when the high byte is zero, otherwise -1. */
757 static int PTRFASTCALL
758 little2_byteToAscii(const ENCODING *enc, const char *p) {
760 return LITTLE2_BYTE_TO_ASCII(p);
/* Function form of LITTLE2_CHAR_MATCHES for the charMatches slot: true when
   the two bytes at p encode exactly the character c. */
764 little2_charMatches(const ENCODING *enc, const char *p, int c) {
766 return LITTLE2_CHAR_MATCHES(p, c);
/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC for the isNameMin slot. */
769 static int PTRFASTCALL
770 little2_isNameMin(const ENCODING *enc, const char *p) {
772 return LITTLE2_IS_NAME_CHAR_MINBPC(p);
/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC for the isNmstrtMin slot. */
775 static int PTRFASTCALL
776 little2_isNmstrtMin(const ENCODING *enc, const char *p) {
778 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
782 # define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
784 #else /* not XML_MIN_SIZE */
787 # define PREFIX(ident) little2_##ident
788 # define MINBPC(enc) 2
789 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
790 # define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
791 # define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
792 # define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
793 # define IS_NAME_CHAR(enc, p, n) 0
794 # define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
795 # define IS_NMSTRT_CHAR(enc, p, n) (0)
796 # define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
798 # define XML_TOK_IMPL_C
799 # include "xmltok_impl.c"
800 # undef XML_TOK_IMPL_C
804 # undef BYTE_TO_ASCII
807 # undef IS_NAME_CHAR_MINBPC
808 # undef IS_NMSTRT_CHAR
809 # undef IS_NMSTRT_CHAR_MINBPC
810 # undef IS_INVALID_CHAR
812 #endif /* not XML_MIN_SIZE */
816 static const struct normal_encoding little2_encoding_ns
818 # if BYTEORDER == 1234
825 # include "asciitab.h"
826 # include "latin1tab.h"
828 STANDARD_VTABLE(little2_) NULL_VTABLE};
832 static const struct normal_encoding little2_encoding
834 #if BYTEORDER == 1234
841 #define BT_COLON BT_NMSTRT
842 #include "asciitab.h"
844 #include "latin1tab.h"
846 STANDARD_VTABLE(little2_) NULL_VTABLE};
848 #if BYTEORDER != 4321
852 static const struct normal_encoding internal_little2_encoding_ns
853 = {{VTABLE, 2, 0, 1},
855 # include "iasciitab.h"
856 # include "latin1tab.h"
858 STANDARD_VTABLE(little2_) NULL_VTABLE};
862 static const struct normal_encoding internal_little2_encoding
863 = {{VTABLE, 2, 0, 1},
865 # define BT_COLON BT_NMSTRT
866 # include "iasciitab.h"
868 # include "latin1tab.h"
870 STANDARD_VTABLE(little2_) NULL_VTABLE};
/* Big-endian two-byte helpers: p[0] is the high byte, p[1] the low byte. */
/* Chooses between the type table of enc (indexed by the low byte) and
   unicode_byte_type(high, low). */
874 #define BIG2_BYTE_TYPE(enc, p) \
876 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
877 : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
878 #define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* True when the high byte is zero and the low byte equals c. */
879 #define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name / name-start lookups; p is used without parentheses, so pass a plain
   pointer expression. */
880 #define BIG2_IS_NAME_CHAR_MINBPC(p) \
881 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
882 #define BIG2_IS_NMSTRT_CHAR_MINBPC(p) \
883 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
/* Function form of BIG2_BYTE_TYPE for the byteType slot named by
   STANDARD_VTABLE(big2_). */
887 static int PTRFASTCALL
888 big2_byteType(const ENCODING *enc, const char *p) {
889 return BIG2_BYTE_TYPE(enc, p);
/* Function form of BIG2_BYTE_TO_ASCII for the byteToAscii slot: the low byte
   when the high byte is zero, otherwise -1. */
892 static int PTRFASTCALL
893 big2_byteToAscii(const ENCODING *enc, const char *p) {
895 return BIG2_BYTE_TO_ASCII(p);
/* Function form of BIG2_CHAR_MATCHES for the charMatches slot: true when the
   two bytes at p encode exactly the character c. */
899 big2_charMatches(const ENCODING *enc, const char *p, int c) {
901 return BIG2_CHAR_MATCHES(p, c);
/* Function form of BIG2_IS_NAME_CHAR_MINBPC for the isNameMin slot. */
904 static int PTRFASTCALL
905 big2_isNameMin(const ENCODING *enc, const char *p) {
907 return BIG2_IS_NAME_CHAR_MINBPC(p);
/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC for the isNmstrtMin slot. */
910 static int PTRFASTCALL
911 big2_isNmstrtMin(const ENCODING *enc, const char *p) {
913 return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
917 # define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
919 #else /* not XML_MIN_SIZE */
922 # define PREFIX(ident) big2_##ident
923 # define MINBPC(enc) 2
924 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
925 # define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
926 # define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
927 # define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
928 # define IS_NAME_CHAR(enc, p, n) 0
929 # define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
930 # define IS_NMSTRT_CHAR(enc, p, n) (0)
931 # define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
933 # define XML_TOK_IMPL_C
934 # include "xmltok_impl.c"
935 # undef XML_TOK_IMPL_C
939 # undef BYTE_TO_ASCII
942 # undef IS_NAME_CHAR_MINBPC
943 # undef IS_NMSTRT_CHAR
944 # undef IS_NMSTRT_CHAR_MINBPC
945 # undef IS_INVALID_CHAR
947 #endif /* not XML_MIN_SIZE */
951 static const struct normal_encoding big2_encoding_ns
953 # if BYTEORDER == 4321
960 # include "asciitab.h"
961 # include "latin1tab.h"
963 STANDARD_VTABLE(big2_) NULL_VTABLE};
967 static const struct normal_encoding big2_encoding
969 #if BYTEORDER == 4321
976 #define BT_COLON BT_NMSTRT
977 #include "asciitab.h"
979 #include "latin1tab.h"
981 STANDARD_VTABLE(big2_) NULL_VTABLE};
983 #if BYTEORDER != 1234
987 static const struct normal_encoding internal_big2_encoding_ns
988 = {{VTABLE, 2, 0, 1},
990 # include "iasciitab.h"
991 # include "latin1tab.h"
993 STANDARD_VTABLE(big2_) NULL_VTABLE};
997 static const struct normal_encoding internal_big2_encoding
998 = {{VTABLE, 2, 0, 1},
1000 # define BT_COLON BT_NMSTRT
1001 # include "iasciitab.h"
1003 # include "latin1tab.h"
1005 STANDARD_VTABLE(big2_) NULL_VTABLE};
/* Works on characters c1 and c2 taken from s1 and s2, folding ASCII a..z to
   A..Z in each before they are used further.  NOTE(review): presumably a
   case-insensitive equality test, as the name suggests; confirm against the
   full body. */
1012 streqci(const char *s1, const char *s2) {
1016 if (ASCII_a <= c1 && c1 <= ASCII_z)
1017 c1 += ASCII_A - ASCII_a;
1018 if (ASCII_a <= c2 && c2 <= ASCII_z)
1019 /* The following line will never get executed. streqci() is
1020 * only called from two places, both of which guarantee to put
1021 * upper-case strings into s2.
1023 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
/* Position update for the initial (not yet determined) encoding: forwards to
   normal_updatePosition with the built-in utf8_encoding instead of the enc
   argument. */
1033 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1036 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
/* Converts the character at ptr with XmlUtf8Convert into an output window
   that is exactly one byte long (p .. p + 1), so only a character whose
   UTF-8 form is a single byte can be produced. */
1040 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1043 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1062 /* Return 1 if there's just optional white space or there's an S
1063 followed by name=val.
/* Scans one name=value pseudo-attribute starting at ptr, stepping by
   enc->minBytesPerChar and examining each character through toAscii.
   Results are handed back through namePtr, nameEndPtr, valPtr and
   nextTokPtr. */
1066 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1067 const char **namePtr, const char **nameEndPtr,
1068 const char **valPtr, const char **nextTokPtr) {
/* The pseudo-attribute has to be introduced by white space. */
1075 if (! isSpace(toAscii(enc, ptr, end))) {
1080 ptr += enc->minBytesPerChar;
1081 } while (isSpace(toAscii(enc, ptr, end)));
1088 c = toAscii(enc, ptr, end);
1093 if (c == ASCII_EQUALS) {
1100 ptr += enc->minBytesPerChar;
1101 } while (isSpace(c = toAscii(enc, ptr, end)));
/* After white space following the name, only an equals sign is accepted. */
1102 if (c != ASCII_EQUALS) {
1108 ptr += enc->minBytesPerChar;
/* ptr still at the recorded name start means the name is empty. */
1110 if (ptr == *namePtr) {
1114 ptr += enc->minBytesPerChar;
1115 c = toAscii(enc, ptr, end);
/* Skip white space between the equals sign and the opening quote. */
1116 while (isSpace(c)) {
1117 ptr += enc->minBytesPerChar;
1118 c = toAscii(enc, ptr, end);
/* The value must open with a double or a single quote character. */
1120 if (c != ASCII_QUOT && c != ASCII_APOS) {
1125 ptr += enc->minBytesPerChar;
1127 for (;; ptr += enc->minBytesPerChar) {
1128 c = toAscii(enc, ptr, end);
/* Inside the value only ASCII letters, digits, period, minus and underscore
   avoid the branch below. */
1131 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1132 && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1133 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
/* The next token starts one character past the current position. */
1138 *nextTokPtr = ptr + enc->minBytesPerChar;
/* Keyword strings spelled out with the ASCII_* constants and explicitly
   NUL-terminated.  doParseXmlDecl compares parsed names and values against
   them with XmlNameMatchesAscii. */
1142 static const char KW_version[]
1143 = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1145 static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1146 ASCII_i, ASCII_n, ASCII_g, '\0'};
1148 static const char KW_standalone[]
1149 = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1150 ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1152 static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1154 static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
/* Parses the pseudo-attributes of an XML or text declaration in the order
   version, encoding, standalone, using parsePseudoAttribute for each one.
   encodingFinder maps the encoding name to an ENCODING; isGeneralTextEntity
   selects the text-declaration rules (the standalone branch is rejected and,
   per the comment below, an encoding declaration is required). */
1157 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1159 int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1160 const char *end, const char **badPtr, const char **versionPtr,
1161 const char **versionEndPtr, const char **encodingName,
1162 const ENCODING **encoding, int *standalone) {
1163 const char *val = NULL;
1164 const char *name = NULL;
1165 const char *nameEnd = NULL;
/* Narrow the range by 5 characters at the front and 2 at the back;
   presumably the opening and closing delimiters of the declaration
   (TODO confirm against the caller). */
1166 ptr += 5 * enc->minBytesPerChar;
1167 end -= 2 * enc->minBytesPerChar;
1168 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1173 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1174 if (! isGeneralTextEntity) {
1182 *versionEndPtr = ptr;
1183 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1188 if (isGeneralTextEntity) {
1189 /* a TextDecl must have an EncodingDecl */
1196 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1197 int c = toAscii(enc, val, end);
/* The encoding name has to begin with an ASCII letter. */
1198 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1203 *encodingName = val;
/* parsePseudoAttribute left ptr one character past the end of the value, so
   ptr - minBytesPerChar is the end of the value text. */
1205 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1206 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1213 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1214 || isGeneralTextEntity) {
1218 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1221 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
/* Skip trailing white space after the last pseudo-attribute. */
1228 while (isSpace(toAscii(enc, ptr, end)))
1229 ptr += enc->minBytesPerChar;
/* Validates a character number.  Callers in this file (unknown_isInvalid,
   XmlInitUnknownEncoding) treat a negative return value as a rejected
   character.  The switch dispatches on the bits above the low byte. */
1238 checkCharRefNumber(int result) {
1239 switch (result >> 8) {
/* Consults the Latin-1 type table for bytes marked as not allowed in XML. */
1250 if (latin1_encoding.type[result] == BT_NONXML)
/* The two noncharacters at the top of the 16-bit range. */
1254 if (result == 0xFFFE || result == 0xFFFF)
/* Writes the UTF-8 encoding of c into buf using the 1-, 2-, 3- or 4-byte
   form.  Callers in this file use the return value as the number of bytes
   written; 0 is returned on the two paths marked as unreachable in
   practice. */
1262 XmlUtf8Encode(int c, char *buf) {
1264 /* minN is minimum legal resulting value for N byte sequence */
1271 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
/* One byte. */
1273 buf[0] = (char)(c | UTF8_cval1);
/* Two bytes: high bits under the lead marker, then 6 bits. */
1277 buf[0] = (char)((c >> 6) | UTF8_cval2);
1278 buf[1] = (char)((c & 0x3f) | 0x80);
/* Three bytes: 6 bits per continuation byte. */
1282 buf[0] = (char)((c >> 12) | UTF8_cval3);
1283 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1284 buf[2] = (char)((c & 0x3f) | 0x80);
/* Four bytes: 6 bits per continuation byte. */
1288 buf[0] = (char)((c >> 18) | UTF8_cval4);
1289 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1290 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1291 buf[3] = (char)((c & 0x3f) | 0x80);
1294 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
/* Writes charNum into buf as UTF-16: a single unit below 0x10000, or two
   units (offsets added to 0xD800 and 0xDC00) below 0x110000. */
1298 XmlUtf16Encode(int charNum, unsigned short *buf) {
1301 if (charNum < 0x10000) {
1302 buf[0] = (unsigned short)charNum;
1305 if (charNum < 0x110000) {
/* High part from charNum >> 10, low part from the bottom ten bits. */
1307 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1308 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
/* Encoding object for a user-supplied single- or multi-byte encoding, filled
   in by XmlInitUnknownEncoding. */
1314 struct unknown_encoding {
/* Kept first: XmlInitUnknownEncoding copies a struct normal_encoding over the
   start of the object, and AS_NORMAL_ENCODING casts the same pointer. */
1315 struct normal_encoding normal;
/* UTF-16 value for each input byte; read by unknown_toUtf16. */
1318 unsigned short utf16[256];
1322 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
/* Size in bytes of struct unknown_encoding, the type that
   XmlInitUnknownEncoding overlays on its mem argument. */
1325 XmlSizeOfUnknownEncoding(void) {
1326 return sizeof(struct unknown_encoding);
/* Name-character test for a user-defined encoding: asks the user converter
   for the character at p, then looks that value up in namePages. */
1329 static int PTRFASTCALL
1330 unknown_isName(const ENCODING *enc, const char *p) {
1331 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1332 int c = uenc->convert(uenc->userData, p);
1335 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
/* Name-start-character test for a user-defined encoding: asks the user
   converter for the character at p, then looks that value up in
   nmstrtPages. */
1338 static int PTRFASTCALL
1339 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1340 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1341 int c = uenc->convert(uenc->userData, p);
1344 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
/* Invalid-character test for a user-defined encoding: non-zero when the user
   converter yields a value with bits set above the low 16, or a value that
   checkCharRefNumber rejects (negative result). */
1347 static int PTRFASTCALL
1348 unknown_isInvalid(const ENCODING *enc, const char *p) {
1349 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1350 int c = uenc->convert(uenc->userData, p);
1351 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
/* Converts text in a user-defined encoding to UTF-8.  Each input byte is
   first looked up in the precomputed utf8 table; the user converter plus
   XmlUtf8Encode are used on the other path.  Returns XML_CONVERT_COMPLETED
   when the input is used up and XML_CONVERT_OUTPUT_EXHAUSTED when the next
   character does not fit. */
1354 static enum XML_Convert_Result PTRCALL
1355 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1356 char **toP, const char *toLim) {
1357 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1358 char buf[XML_UTF8_ENCODE_MAX];
1362 if (*fromP == fromLim)
1363 return XML_CONVERT_COMPLETED;
/* Table entry for this lead byte, as filled in by XmlInitUnknownEncoding. */
1364 utf8 = uenc->utf8[(unsigned char)**fromP];
/* Converter path: encode the converted value into the local buffer. */
1367 int c = uenc->convert(uenc->userData, *fromP);
1368 n = XmlUtf8Encode(c, buf);
1369 if (n > toLim - *toP)
1370 return XML_CONVERT_OUTPUT_EXHAUSTED;
1372 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1375 if (n > toLim - *toP)
1376 return XML_CONVERT_OUTPUT_EXHAUSTED;
1379 memcpy(*toP, utf8, n);
/* Converts text in a user-defined encoding to 16-bit units.  Each input byte
   is looked up in the precomputed utf16 table, with the user converter as
   the other source of the value.  Returns XML_CONVERT_OUTPUT_EXHAUSTED only
   when the output is full while input remains, otherwise
   XML_CONVERT_COMPLETED. */
1384 static enum XML_Convert_Result PTRCALL
1385 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1386 unsigned short **toP, const unsigned short *toLim) {
1387 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1388 while (*fromP < fromLim && *toP < toLim) {
1389 unsigned short c = uenc->utf16[(unsigned char)**fromP];
/* Converter path: the result is truncated to 16 bits. */
1391 c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1392 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1399 if ((*toP == toLim) && (*fromP < fromLim))
1400 return XML_CONVERT_OUTPUT_EXHAUSTED;
1402 return XML_CONVERT_COMPLETED;
/* Builds a struct unknown_encoding in mem from a 256-entry table of character
   values and an optional converter, starting from a copy of latin1_encoding.
   For every byte value it fills the type, utf8 and utf16 tables, then
   installs the converter-backed predicates and the unknown_toUtf8 /
   unknown_toUtf16 converters.  Returns the embedded ENCODING. */
1406 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1409 struct unknown_encoding *e = (struct unknown_encoding *)mem;
/* Start from the Latin-1 encoding; only the normal_encoding part is copied. */
1410 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
/* Checks the first 128 table entries: a byte whose Latin-1 type is neither
   BT_OTHER nor BT_NONXML is flagged when it does not map to itself. */
1411 for (i = 0; i < 128; i++)
1412 if (latin1_encoding.type[i] != BT_OTHER
1413 && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1415 for (i = 0; i < 256; i++) {
1418 e->normal.type[i] = BT_MALFORM;
1419 /* This shouldn't really get used. */
1420 e->utf16[i] = 0xFFFF;
1426 /* Multi-byte sequences need a converter function */
/* The lead-byte type is derived from c by offsetting from BT_LEAD2. */
1429 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1432 } else if (c < 0x80) {
1433 if (latin1_encoding.type[c] != BT_OTHER
1434 && latin1_encoding.type[c] != BT_NONXML && c != i)
1436 e->normal.type[i] = latin1_encoding.type[c];
1438 e->utf8[i][1] = (char)c;
1439 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1440 } else if (checkCharRefNumber(c) < 0) {
1441 e->normal.type[i] = BT_NONXML;
1442 /* This shouldn't really get used. */
1443 e->utf16[i] = 0xFFFF;
/* Remaining values are classified through the naming bitmaps. */
1449 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1450 e->normal.type[i] = BT_NMSTRT;
1451 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1452 e->normal.type[i] = BT_NAME;
1454 e->normal.type[i] = BT_OTHER;
/* utf8[i][0] holds the encoded length; the bytes follow from index 1. */
1455 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1456 e->utf16[i] = (unsigned short)c;
1459 e->userData = userData;
1460 e->convert = convert;
/* All sequence lengths share the same converter-backed predicates. */
1462 e->normal.isName2 = unknown_isName;
1463 e->normal.isName3 = unknown_isName;
1464 e->normal.isName4 = unknown_isName;
1465 e->normal.isNmstrt2 = unknown_isNmstrt;
1466 e->normal.isNmstrt3 = unknown_isNmstrt;
1467 e->normal.isNmstrt4 = unknown_isNmstrt;
1468 e->normal.isInvalid2 = unknown_isInvalid;
1469 e->normal.isInvalid3 = unknown_isInvalid;
1470 e->normal.isInvalid4 = unknown_isInvalid;
1472 e->normal.enc.utf8Convert = unknown_toUtf8;
1473 e->normal.enc.utf16Convert = unknown_toUtf16;
1474 return &(e->normal.enc);
1477 /* If this enumeration is changed, getEncodingIndex and encodings
1478 must also be changed. */
1487 /* must match encodingNames up to here */
1491 static const char KW_ISO_8859_1[]
1492 = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8,
1493 ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'};
1494 static const char KW_US_ASCII[]
1495 = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1496 ASCII_C, ASCII_I, ASCII_I, '\0'};
1497 static const char KW_UTF_8[]
1498 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1499 static const char KW_UTF_16[]
1500 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1501 static const char KW_UTF_16BE[]
1502 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1503 ASCII_6, ASCII_B, ASCII_E, '\0'};
1504 static const char KW_UTF_16LE[]
1505 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1506 ASCII_6, ASCII_L, ASCII_E, '\0'};
/* Looks `name` up in the table of built-in encoding names, comparing each
   entry with streqci() (presumably a case-insensitive string comparison --
   TODO confirm).
   NOTE(review): this listing omits some original lines (see the gaps in the
   embedded numbering), including the return type, the end of the table and
   every return statement -- confirm the result values against the full
   file. */
1509 getEncodingIndex(const char *name) {
/* Candidate names, in table order. */
1510 static const char *const encodingNames[] = {
1511 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
/* Linear scan; the bound is the number of entries in encodingNames. */
1516 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1517 if (streqci(name, encodingNames[i]))
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, converted to char.  The argument is
   parenthesized so that the cast applies to the whole expression passed in
   (e.g. `a / b`) rather than only to its first operand.
*/
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1529 /* This is what detects the encoding. encodingTable maps from
1530 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1531 the external (protocol) specified encoding; state is
1532 XML_CONTENT_STATE if we're parsing an external text entity, and
1533 XML_PROLOG_STATE otherwise.
1537 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1538 int state, const char *ptr, const char *end, const char **nextTokPtr) {
/* NOTE(review): this listing omits some original lines (see the gaps in the
   embedded numbering), including case labels, else-branches and several
   return statements -- the notes below describe only the visible
   statements; confirm against the full file. */
1539 const ENCODING **encPtr;
1542 return XML_TOK_NONE;
/* Slot that receives the selected encoding (written through *encPtr below). */
1543 encPtr = enc->encPtr;
1544 if (ptr + 1 == end) {
1545 /* only a single byte available for auto-detection */
1546 #ifndef XML_DTD /* FIXME */
1547 /* a well-formed document entity must have more than one byte */
1548 if (state != XML_CONTENT_STATE)
1549 return XML_TOK_PARTIAL;
1551 /* so we're parsing an external text entity... */
1552 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1553 switch (INIT_ENC_INDEX(enc)) {
1557 return XML_TOK_PARTIAL;
/* Dispatch on the single available byte. */
1559 switch ((unsigned char)*ptr) {
1562 case 0xEF: /* possibly first byte of UTF-8 BOM */
1563 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1568 return XML_TOK_PARTIAL;
/* Dispatch on the first two bytes, combined with ptr[0] as the high byte
   (the case labels are on lines not shown). */
1571 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1573 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
/* Report the position just past the two examined bytes and select the
   UTF-16BE entry of encodingTable. */
1575 *nextTokPtr = ptr + 2;
1576 *encPtr = encodingTable[UTF_16BE_ENC];
1578 /* 00 3C is handled in the default case */
1580 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1581 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1582 && state == XML_CONTENT_STATE)
/* Select UTF-16LE and let that encoding's tokenizer scan the same input. */
1584 *encPtr = encodingTable[UTF_16LE_ENC];
1585 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1587 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
/* Report the position just past the two examined bytes and select the
   UTF-16LE entry of encodingTable. */
1589 *nextTokPtr = ptr + 2;
1590 *encPtr = encodingTable[UTF_16LE_ENC];
1593 /* Maybe a UTF-8 BOM (EF BB BF) */
1594 /* If there's an explicitly specified (external) encoding
1595 of ISO-8859-1 or some flavour of UTF-16
1596 and this is an external text entity,
1597 don't look for the BOM,
1598 because it might be a legal data.
1600 if (state == XML_CONTENT_STATE) {
1601 int e = INIT_ENC_INDEX(enc);
1602 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1607 return XML_TOK_PARTIAL;
1608 if ((unsigned char)ptr[2] == 0xBF) {
1609 *nextTokPtr = ptr + 3;
1610 *encPtr = encodingTable[UTF_8_ENC];
1615 if (ptr[0] == '\0') {
1616 /* 0 isn't a legal data character. Furthermore a document
1617 entity can only start with ASCII characters. So the only
1618 way this can fail to be big-endian UTF-16 if it it's an
1619 external parsed general entity that's labelled as
1622 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1624 *encPtr = encodingTable[UTF_16BE_ENC];
1625 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1626 } else if (ptr[1] == '\0') {
1627 /* We could recover here in the case:
1628 - parsing an external entity
1630 - no externally specified encoding
1631 - no encoding declaration
1632 by assuming UTF-16LE. But we don't, because this would mean when
1633 presented just with a single byte, we couldn't reliably determine
1634 whether we needed further bytes.
1636 if (state == XML_CONTENT_STATE)
1638 *encPtr = encodingTable[UTF_16LE_ENC];
1639 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1644 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1645 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1650 #define XML_TOK_NS_C
1651 #include "xmltok_ns.c"
1658 # define NS(x) x##NS
1659 # define ns(x) x##_ns
1661 # define XML_TOK_NS_C
1662 # include "xmltok_ns.c"
1663 # undef XML_TOK_NS_C
1669 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1671 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1673 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;