2 * Copyright 2015 Matthew Dillon <dillon@backplane.com> (mbintowcr, wcrtombin)
3 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
4 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
5 * Copyright (c) 2002-2004 Tim J. Robbins
8 * Copyright (c) 2011 The FreeBSD Foundation
10 * Portions of this software were developed by David Chisnall
11 * under sponsorship from the FreeBSD Foundation.
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * WCSBIN_EOF - Indicate EOF on input buffer.
38 * WCSBIN_SURRO - Pass-through surrogate space (typically if the UTF-8
39 * has already been escaped), on bytes-to-wchars and
40 * wchars-to-bytes. Escaping of other illegal codes will
41 * still occur on input but de-escaping will not occur
42 * on output (they will remain in the surrogate space).
44 * WCSBIN_LONGCODES - Allow 4-byte > 0x10FFFF, 5-byte and 6-byte sequences
45 * (normally illegal), otherwise escape it on input
48 * WCSBIN_STRICT - Allow byte-to-wide conversions to fail.
51 #include <sys/param.h>
61 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
62 size_t, mbstate_t * __restrict);
63 static int _UTF8_mbsinit(const mbstate_t *);
64 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict,
65 const char ** __restrict, size_t, size_t,
66 mbstate_t * __restrict);
67 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t,
68 mbstate_t * __restrict);
69 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
70 size_t, size_t, mbstate_t * __restrict);
71 static size_t _UTF8_mbintowcr(wchar_t * __restrict dst,
72 const char * __restrict src,
73 size_t dlen, size_t *slen, int flags);
74 static size_t _UTF8_wcrtombin(char * __restrict dst,
75 const wchar_t * __restrict src,
76 size_t dlen, size_t *slen, int flags);
85 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
88 l->__mbrtowc = _UTF8_mbrtowc;
89 l->__wcrtomb = _UTF8_wcrtomb;
90 l->__mbsinit = _UTF8_mbsinit;
91 l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
92 l->__wcsnrtombs = _UTF8_wcsnrtombs;
93 l->__mbintowcr = _UTF8_mbintowcr;
94 l->__wcrtombin = _UTF8_wcrtombin;
98 * UCS-4 encoding used as the internal representation, so
99 * slots 0x0080-0x00FF are occupied and must be excluded
100 * from the single byte ctype by setting the limit.
102 l->__mb_sb_limit = 128;
108 _UTF8_mbsinit(const mbstate_t *ps)
111 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
115 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
116 mbstate_t * __restrict ps)
119 int ch, i, mask, want;
122 us = (_UTF8State *)ps;
124 if (us->want < 0 || us->want > 4) {
136 /* Incomplete multibyte sequence */
141 * Determine the number of octets that make up this character
142 * from the first octet, and a mask that extracts the
143 * interesting bits of the first octet. We already know
144 * the character is at least two bytes long.
146 * We also specify a lower bound for the character code to
147 * detect redundant, non-"shortest form" encodings. For
148 * example, the sequence C0 80 is _not_ a legal representation
149 * of the null character. This enforces a 1-to-1 mapping
150 * between character codes and their multibyte representations.
152 ch = (unsigned char)*s;
153 if ((ch & 0x80) == 0) {
154 /* Fast path for plain ASCII characters. */
157 return (ch != '\0' ? 1 : 0);
159 if ((ch & 0xe0) == 0xc0) {
163 } else if ((ch & 0xf0) == 0xe0) {
167 } else if ((ch & 0xf8) == 0xf0) {
173 * Malformed input; input is not UTF-8.
184 * Decode the octet sequence representing the character in chunks
185 * of 6 bits, most significant first.
188 wch = (unsigned char)*s++ & mask;
192 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
193 if ((*s & 0xc0) != 0x80) {
195 * Malformed input; bad characters in the middle
205 /* Incomplete multibyte sequence. */
211 if (wch < lbound || wch > 0x10ffff) {
213 * Malformed input; redundant encoding or illegal
222 return (wch == L'\0' ? 0 : want);
226 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
227 size_t nms, size_t len, mbstate_t * __restrict ps)
235 us = (_UTF8State *)ps;
242 * The fast path in the loop below is not safe if an ASCII
243 * character appears as anything but the first byte of a
244 * multibyte sequence. Check now to avoid doing it in the loop.
246 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
251 if (nms > 0 && (signed char)*s > 0)
253 * Fast path for plain ASCII characters
257 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
259 /* Invalid sequence - mbrtowc() sets errno. */
261 else if (nb == 0 || nb == (size_t)-2)
271 * The fast path in the loop below is not safe if an ASCII
272 * character appears as anything but the first byte of a
273 * multibyte sequence. Check now to avoid doing it in the loop.
275 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
280 if (nms > 0 && (signed char)*s > 0) {
282 * Fast path for plain ASCII characters
287 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
291 } else if (nb == (size_t)-2) {
294 } else if (nb == 0) {
308 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
314 us = (_UTF8State *)ps;
322 /* Reset to initial shift state (no-op) */
326 * Determine the number of octets needed to represent this character.
327 * We always output the shortest sequence possible. Also specify the
328 * first few bits of the first octet, which contains the information
329 * about the sequence length.
331 if ((wc & ~0x7f) == 0) {
332 /* Fast path for plain ASCII characters. */
335 } else if ((wc & ~0x7ff) == 0) {
338 } else if ((wc & ~0xffff) == 0) {
341 } else if (wc <= 0x10ffff) {
350 * Output the octets representing the character in chunks
351 * of 6 bits, least significant last. The first octet is
352 * a special case because it contains the sequence length
355 for (i = len - 1; i > 0; i--) {
356 s[i] = (wc & 0x3f) | 0x80;
359 *s = (wc & 0xff) | lead;
365 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
366 size_t nwc, size_t len, mbstate_t * __restrict ps)
369 char buf[MB_LEN_MAX];
374 us = (_UTF8State *)ps;
386 if (0 <= *s && *s < 0x80)
387 /* Fast path for plain ASCII characters. */
389 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
391 /* Invalid character - wcrtomb() sets errno. */
394 return (nbytes + nb - 1);
401 while (len > 0 && nwc-- > 0) {
402 if (0 <= *s && *s < 0x80) {
403 /* Fast path for plain ASCII characters. */
406 } else if (len > (size_t)MB_CUR_MAX) {
407 /* Enough space to translate in-place. */
408 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
414 * May not be enough space; use temp. buffer.
416 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
421 /* MB sequence for character won't fit. */
423 (void) memcpy(dst, buf, nb);
427 return (nbytes + nb - 1);
439 * Clean binary to wchar buffer conversions. This is basically like a normal
440 * buffer conversion but with a sane argument API and escaping. See none.c
441 * for a more complete description.
444 _UTF8_mbintowcr(wchar_t * __restrict dst, const char * __restrict src,
445 size_t dlen, size_t *slen, int flags)
454 for (i = j = 0; i < n; ++i) {
457 ch = (unsigned char)src[i];
459 if ((ch & 0x80) == 0) {
460 /* Fast path for plain ASCII characters. */
466 if ((ch & 0xe0) == 0xc0) {
470 } else if ((ch & 0xf0) == 0xe0) {
474 } else if ((ch & 0xf8) == 0xf0) {
478 } else if ((ch & 0xfc) == 0xf8) {
479 /* normally illegal, handled down below */
483 } else if ((ch & 0xfe) == 0xfc) {
484 /* normally illegal, handled down below */
490 * Malformed input; input is not UTF-8, escape
493 if (flags & WCSBIN_STRICT) {
501 dst[j] = 0xDC00 | ch;
507 * Construct wchar_t from multibyte sequence.
510 for (k = 1; k < want; ++k) {
512 * Stop if not enough input (don't do this early
513 * so we can detect illegal characters as they occur
516 * If termination is requested force-escape all chars.
519 if (flags & WCSBIN_EOF) {
527 if ((ch & 0xc0) != 0x80) {
529 * Malformed input, bad characters in the
530 * middle of a multibyte sequence. Escape
533 if (flags & WCSBIN_STRICT) {
541 dst[j] = 0xDC00 | (unsigned char)src[i];
550 * Check validity of the wchar. If invalid we could escape
551 * just the first character and loop up, but it ought to be
552 * more readable if we escape all the chars in the sequence
553 * (since they are all >= 0x80 and might represent a legacy
554 * 5-byte or 6-byte code).
557 ((flags & WCSBIN_LONGCODES) == 0 && wch > 0x10ffff)) {
562 * Check if wch is a surrogate code (which also encloses our
563 * UTF-8B escaping range). This is normally illegal in UTF8.
564 * If it is, we need to escape each character in the sequence.
565 * Breakout if there isn't enough output buffer space.
567 * If (flags & WCSBIN_SURRO) the caller wishes to accept
568 * surrogate codes, i.e. the input might potentially already
569 * be escaped UTF8-B or unchecked UTF-16 that was converted
572 if ((flags & WCSBIN_SURRO) == 0 &&
573 wch >= 0xD800 && wch <= 0xDFFF) {
577 if (flags & WCSBIN_STRICT) {
584 for (k = 0; k < want; ++k) {
587 (unsigned char)src[i+k];
608 _UTF8_wcrtombin(char * __restrict dst, const wchar_t * __restrict src,
609 size_t dlen, size_t *slen, int flags)
619 for (i = j = 0; i < n; ++i) {
624 if ((wc & ~0x7f) == 0) {
625 /* Fast path for plain ASCII characters. */
627 dst[j] = (unsigned char)wc;
631 if ((wc & ~0x7ff) == 0) {
634 } else if (wc >= 0xDC80 && wc <= 0xDCFF &&
635 (flags & WCSBIN_SURRO) == 0) {
636 if (flags & WCSBIN_STRICT) {
638 * STRICT without SURRO is an error for
648 dst[j] = (unsigned char)wc;
651 } else if ((wc & ~0xffff) == 0) {
652 if (wc >= 0xD800 && wc <= 0xDFFF &&
653 (flags & (WCSBIN_SURRO | WCSBIN_STRICT)) ==
656 * Surrogates in general are an error
657 * if STRICT is specified and SURRO is not
668 } else if (wc <= 0x10ffff) {
671 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x200000) {
672 /* normally illegal */
675 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x4000000) {
676 /* normally illegal */
679 } else if ((flags & WCSBIN_LONGCODES) &&
680 (uint32_t)wc < 0x80000000U) {
681 /* normally illegal */
689 /* stop here, process error on next loop */
694 * Output the octets representing the character in chunks
695 * of 6 bits, least significant last. The first octet is
696 * a special case because it contains the sequence length
705 dst[k + len] = (wc & 0x3f) | 0x80;
708 dst[k] = (wc & 0xff) | lead;
717 utf8towcr(wchar_t * __restrict dst, const char * __restrict src,
718 size_t dlen, size_t *slen, int flags)
720 return _UTF8_mbintowcr(dst, src, dlen, slen, flags);
724 wcrtoutf8(char * __restrict dst, const wchar_t * __restrict src,
725 size_t dlen, size_t *slen, int flags)
727 return _UTF8_wcrtombin(dst, src, dlen, slen, flags);