2 * Copyright 2015 Matthew Dillon <dillon@backplane.com> (mbintowcr, wcrtombin)
3 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
4 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
5 * Copyright (c) 2002-2004 Tim J. Robbins
8 * Copyright (c) 2011 The FreeBSD Foundation
10 * Portions of this software were developed by David Chisnall
11 * under sponsorship from the FreeBSD Foundation.
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * WCSBIN_EOF - Indicate EOF on input buffer.
38 * WCSBIN_SURRO - Pass-through surrogate space (typically if the UTF-8
39 * has already been escaped), on bytes-to-wchars and
40 * wchars-to-bytes. Escaping of other illegal codes will
41 * still occur on input but de-escaping will not occur
42 * on output (they will remain in the surrogate space).
44 * WCSBIN_LONGCODES - Allow 4-byte >= 0x10FFFF, 5-byte and 6-byte sequences
45 * (normally illegal), otherwise escape it on input
48 * WCSBIN_STRICT - Allow byte-to-wide conversions to fail.
51 #include <sys/param.h>
61 extern int __mb_sb_limit;
/*
 * Forward declarations of the UTF-8 conversion routines.  _UTF8_init()
 * stores a pointer to each of them in struct xlocale_ctype.
 */
63 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
64 size_t, mbstate_t * __restrict);
65 static int _UTF8_mbsinit(const mbstate_t *);
66 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict,
67 const char ** __restrict, size_t, size_t,
68 mbstate_t * __restrict);
69 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t,
70 mbstate_t * __restrict);
71 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
72 size_t, size_t, mbstate_t * __restrict);
73 static size_t _UTF8_mbintowcr(wchar_t * __restrict dst,
74 const char * __restrict src,
75 size_t dlen, size_t *slen, int flags);
76 static size_t _UTF8_wcrtombin(char * __restrict dst,
77 const wchar_t * __restrict src,
78 size_t dlen, size_t *slen, int flags);
/*
 * Point every conversion hook in the ctype method table l at its UTF-8
 * implementation.
 */
87 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
90 l->__mbrtowc = _UTF8_mbrtowc;
91 l->__wcrtomb = _UTF8_wcrtomb;
92 l->__mbsinit = _UTF8_mbsinit;
93 l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
94 l->__wcsnrtombs = _UTF8_wcsnrtombs;
95 l->__mbintowcr = _UTF8_mbintowcr;
96 l->__wcrtombin = _UTF8_wcrtombin;
100 * UCS-4 encoding used as the internal representation, so
101 * slots 0x0080-0x00FF are occupied and must be excluded
102 * from the single byte ctype by setting the limit.
104 l->__mb_sb_limit = 128;
/*
 * Report whether ps describes the initial conversion state: true when ps
 * is NULL, or when the want field of the _UTF8State it points to is zero.
 */
110 _UTF8_mbsinit(const mbstate_t *ps)
113 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
/*
 * Decode a single UTF-8 character from at most n octets of s, continuing
 * any partially decoded sequence recorded in ps.
 *
 * As far as the statements shown here establish: a saved want outside
 * 0..4 is checked for first; a plain ASCII octet takes the fast path and
 * yields 1 (0 for NUL); lead octets 110xxxxx, 1110xxxx and 11110xxx start
 * longer sequences and any other lead octet is malformed; every further
 * octet must be 10xxxxxx; a completed character yields want (0 for a
 * null wide character).
 *
 * The upper limit is checked with a range comparison, not a mask:
 * 0x10ffff has bits 16-19 clear, so testing against its complement would
 * also reject U+10000 through U+FFFFF, which are valid four-octet
 * characters.
 */
117 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
118 mbstate_t * __restrict ps)
121 int ch, i, mask, want;
124 us = (_UTF8State *)ps;
126 if (us->want < 0 || us->want > 4) {
138 /* Incomplete multibyte sequence */
143 * Determine the number of octets that make up this character
144 * from the first octet, and a mask that extracts the
145 * interesting bits of the first octet. We already know
146 * the character is at least two bytes long.
148 * We also specify a lower bound for the character code to
149 * detect redundant, non-"shortest form" encodings. For
150 * example, the sequence C0 80 is _not_ a legal representation
151 * of the null character. This enforces a 1-to-1 mapping
152 * between character codes and their multibyte representations.
154 ch = (unsigned char)*s;
155 if ((ch & 0x80) == 0) {
156 /* Fast path for plain ASCII characters. */
159 return (ch != '\0' ? 1 : 0);
161 if ((ch & 0xe0) == 0xc0) {
165 } else if ((ch & 0xf0) == 0xe0) {
169 } else if ((ch & 0xf8) == 0xf0) {
175 * Malformed input; input is not UTF-8.
186 * Decode the octet sequence representing the character in chunks
187 * of 6 bits, most significant first.
190 wch = (unsigned char)*s++ & mask;
194 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
195 if ((*s & 0xc0) != 0x80) {
197 * Malformed input; bad characters in the middle
207 /* Incomplete multibyte sequence. */
213 if (wch < lbound || wch > 0x10ffff) {
215 * Malformed input; redundant encoding or illegal
224 return (wch == L'\0' ? 0 : want);
/*
 * Convert up to nms bytes of the multibyte string *src into wide
 * characters, calling _UTF8_mbrtowc() for everything outside the ASCII
 * fast path.
 *
 * Two variants of the conversion loop are visible: one decodes into a
 * local wc, the other decodes into dst and is additionally gated on
 * len > 0.  Each is preceded by a check for an ASCII byte arriving while
 * us->want > 0, because the in-loop fast path is only safe for the first
 * byte of a character.
 */
228 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
229 size_t nms, size_t len, mbstate_t * __restrict ps)
237 us = (_UTF8State *)ps;
244 * The fast path in the loop below is not safe if an ASCII
245 * character appears as anything but the first byte of a
246 * multibyte sequence. Check now to avoid doing it in the loop.
248 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
253 if (nms > 0 && (signed char)*s > 0)
255 * Fast path for plain ASCII characters
259 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
261 /* Invalid sequence - mbrtowc() sets errno. */
263 else if (nb == 0 || nb == (size_t)-2)
273 * The fast path in the loop below is not safe if an ASCII
274 * character appears as anything but the first byte of a
275 * multibyte sequence. Check now to avoid doing it in the loop.
277 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
282 if (nms > 0 && (signed char)*s > 0) {
284 * Fast path for plain ASCII characters
289 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
293 } else if (nb == (size_t)-2) {
296 } else if (nb == 0) {
/*
 * Encode the wide character wc as UTF-8 into s.
 *
 * The sequence length is picked from the magnitude of wc: values that fit
 * in 7, 11 and 16 bits are tested in that order, followed by everything
 * else up to 0x10ffff.  Trailing octets are written back to front as
 * 10xxxxxx, and the first octet is combined with the length marker held
 * in lead.
 *
 * The last length test is a range comparison, not a mask: 0x10ffff has
 * bits 16-19 clear, so testing against its complement would refuse to
 * encode U+10000 through U+FFFFF.
 */
310 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
316 us = (_UTF8State *)ps;
324 /* Reset to initial shift state (no-op) */
328 * Determine the number of octets needed to represent this character.
329 * We always output the shortest sequence possible. Also specify the
330 * first few bits of the first octet, which contains the information
331 * about the sequence length.
333 if ((wc & ~0x7f) == 0) {
334 /* Fast path for plain ASCII characters. */
337 } else if ((wc & ~0x7ff) == 0) {
340 } else if ((wc & ~0xffff) == 0) {
343 } else if (wc >= 0 && wc <= 0x10ffff) {
352 * Output the octets representing the character in chunks
353 * of 6 bits, least significant last. The first octet is
354 * a special case because it contains the sequence length
357 for (i = len - 1; i > 0; i--) {
358 s[i] = (wc & 0x3f) | 0x80;
361 *s = (wc & 0xff) | lead;
/*
 * Convert up to nwc wide characters from *src into UTF-8, calling
 * _UTF8_wcrtomb() for everything outside the ASCII fast path.
 *
 * Two loops are visible.  One encodes only into the local scratch buffer
 * buf.  The other runs while len and nwc are both non-zero and writes to
 * dst: it encodes in place when len exceeds MB_CUR_MAX, and otherwise
 * stages the character in buf and copies it across with memcpy().
 */
367 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
368 size_t nwc, size_t len, mbstate_t * __restrict ps)
371 char buf[MB_LEN_MAX];
376 us = (_UTF8State *)ps;
388 if (0 <= *s && *s < 0x80)
389 /* Fast path for plain ASCII characters. */
391 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
393 /* Invalid character - wcrtomb() sets errno. */
396 return (nbytes + nb - 1);
403 while (len > 0 && nwc-- > 0) {
404 if (0 <= *s && *s < 0x80) {
405 /* Fast path for plain ASCII characters. */
408 } else if (len > (size_t)MB_CUR_MAX) {
409 /* Enough space to translate in-place. */
410 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
416 * May not be enough space; use temp. buffer.
418 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
423 /* MB sequence for character won't fit. */
425 (void) memcpy(dst, buf, nb);
429 return (nbytes + nb - 1);
441 * Clean binary to wchar buffer conversions. This is basically like a normal
442 * buffer conversion but with a sane argument API and escaping. See none.c
443 * for a more complete description.
/*
 * Convert the bytes of src into wide characters in dst.  Bytes that do
 * not form acceptable UTF-8 are escaped as 0xDC00 | byte; WCSBIN_STRICT
 * is tested before each of those escapes.
 *
 * Code points above 0x10ffff are refused unless WCSBIN_LONGCODES is set.
 * That limit is a range comparison, not a mask: 0x10ffff has bits 16-19
 * clear, so testing against its complement would also escape valid input
 * in U+10000 through U+FFFFF.
 */
446 _UTF8_mbintowcr(wchar_t * __restrict dst, const char * __restrict src,
447 size_t dlen, size_t *slen, int flags)
456 for (i = j = 0; i < n; ++i) {
459 ch = (unsigned char)src[i];
461 if ((ch & 0x80) == 0) {
462 /* Fast path for plain ASCII characters. */
468 if ((ch & 0xe0) == 0xc0) {
472 } else if ((ch & 0xf0) == 0xe0) {
476 } else if ((ch & 0xf8) == 0xf0) {
480 } else if ((ch & 0xfc) == 0xf8) {
481 /* normally illegal, handled down below */
485 } else if ((ch & 0xfe) == 0xfc) {
486 /* normally illegal, handled down below */
492 * Malformed input; input is not UTF-8, escape
495 if (flags & WCSBIN_STRICT) {
503 dst[j] = 0xDC00 | ch;
509 * Construct wchar_t from multibyte sequence.
512 for (k = 1; k < want; ++k) {
514 * Stop if not enough input (don't do this early
515 * so we can detect illegal characters as they occur
518 * If termination is requested force-escape all chars.
521 if (flags & WCSBIN_EOF) {
529 if ((ch & 0xc0) != 0x80) {
531 * Malformed input, bad characters in the
532 * middle of a multibyte sequence. Escape
535 if (flags & WCSBIN_STRICT) {
543 dst[j] = 0xDC00 | (unsigned char)src[i];
552 * Check validity of the wchar. If invalid we could escape
553 * just the first character and loop up, but it ought to be
554 * more readable if we escape all the chars in the sequence
555 * (since they are all >= 0x80 and might represent a legacy
556 * 5-byte or 6-byte code).
559 ((flags & WCSBIN_LONGCODES) == 0 && wch > 0x10ffff) ||
560 ((flags & WCSBIN_LONGCODES) == 0 && want >= 5)) {
565 * Check if wch is a surrogate code (which also encloses our
566 * UTF-8B escaping range). This is normally illegal in UTF8.
567 * If it is, we need to escape each character in the sequence.
568 * Breakout if there isn't enough output buffer space.
570 * If (flags & WCSBIN_SURRO) the caller wishes to accept
571 * surrogate codes, i.e. the input might potentially already
572 * be escaped UTF8-B or unchecked UTF-16 that was converted
575 if ((flags & WCSBIN_SURRO) == 0 &&
576 wch >= 0xD800 && wch <= 0xDFFF) {
580 if (flags & WCSBIN_STRICT) {
587 for (k = 0; k < want; ++k) {
590 (unsigned char)src[i+k];
/*
 * Convert the wide characters of src into UTF-8 bytes in dst.
 *
 * ASCII is copied straight through.  A value in 0xDC80-0xDCFF is written
 * back as the single byte (unsigned char)wc unless WCSBIN_SURRO is set;
 * WCSBIN_STRICT is consulted on that path and on the general surrogate
 * path.  Values beyond 0x10ffff are encoded only when WCSBIN_LONGCODES
 * is set.
 *
 * The 0x10ffff limit is a range comparison, not a mask: 0x10ffff has
 * bits 16-19 clear, so testing against its complement would push
 * U+10000 through U+FFFFF onto the "normally illegal" branches.
 */
611 _UTF8_wcrtombin(char * __restrict dst, const wchar_t * __restrict src,
612 size_t dlen, size_t *slen, int flags)
622 for (i = j = 0; i < n; ++i) {
627 if ((wc & ~0x7f) == 0) {
628 /* Fast path for plain ASCII characters. */
630 dst[j] = (unsigned char)wc;
634 if ((wc & ~0x7ff) == 0) {
637 } else if (wc >= 0xDC80 && wc <= 0xDCFF &&
638 (flags & WCSBIN_SURRO) == 0) {
639 if (flags & WCSBIN_STRICT) {
641 * STRICT without SURRO is an error for
651 dst[j] = (unsigned char)wc;
654 } else if ((wc & ~0xffff) == 0) {
655 if (wc >= 0xD800 && wc <= 0xDFFF &&
656 (flags & (WCSBIN_SURRO | WCSBIN_STRICT)) ==
659 * Surrogates in general are an error
660 * if STRICT is specified and SURRO is not
671 } else if ((uint32_t)wc <= 0x10ffff) {
674 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x200000) {
675 /* normally illegal */
678 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x4000000) {
679 /* normally illegal */
682 } else if ((flags & WCSBIN_LONGCODES) &&
683 (uint32_t)wc < 0x80000000U) {
684 /* normally illegal */
692 /* stop here, process error on next loop */
697 * Output the octets representing the character in chunks
698 * of 6 bits, least significant last. The first octet is
699 * a special case because it contains the sequence length
708 dst[k + len] = (wc & 0x3f) | 0x80;
711 dst[k] = (wc & 0xff) | lead;
/*
 * Thin wrapper: passes dst, src, dlen, slen and flags unchanged to
 * _UTF8_mbintowcr() and returns its result.
 */
720 utf8towcr(wchar_t * __restrict dst, const char * __restrict src,
721 size_t dlen, size_t *slen, int flags)
723 return _UTF8_mbintowcr(dst, src, dlen, slen, flags);
/*
 * Thin wrapper: passes dst, src, dlen, slen and flags unchanged to
 * _UTF8_wcrtombin() and returns its result.
 */
727 wcrtoutf8(char * __restrict dst, const wchar_t * __restrict src,
728 size_t dlen, size_t *slen, int flags)
730 return _UTF8_wcrtombin(dst, src, dlen, slen, flags);