1 // Locale support (codecvt) -*- C++ -*-
3 // Copyright (C) 2015 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
26 #include <cstring> // std::memcpy, std::memcmp
27 #include <bits/stl_algobase.h> // std::max
29 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
30 namespace std _GLIBCXX_VISIBILITY(default)
32 _GLIBCXX_BEGIN_NAMESPACE_VERSION
36 // Largest code point that fits in a single UTF-16 code unit.
37 const char32_t max_single_utf16_unit = 0xFFFF;
// Default value of the maxcode parameter of the conversion helpers below.
39 const char32_t max_code_point = 0x10FFFF;
41 // The functions below rely on maxcode < incomplete_mb_character
42 // (which is enforced by the codecvt_utf* classes on construction).
// Sentinel results of the read_*_code_point helpers. As unsigned values both
// exceed any permitted maxcode, so a "result > maxcode" test also catches them.
43 const char32_t incomplete_mb_character = char32_t(-2);
44 const char32_t invalid_mb_sequence = char32_t(-1);
// Members of the range<Elem> helper: a cursor (next) and a limit (end) over a
// sequence of Elem. Only the member functions are visible in this excerpt.
46 template<typename Elem>
// Read the element at the cursor without advancing it.
52 Elem operator*() const { return *next; }
// Advance the cursor by one element.
54 range& operator++() { ++next; return *this; }
// Number of elements between the cursor and the end.
56 size_t size() const { return end - next; }
59 // Multibyte sequences can have a "header" consisting of a Byte Order Mark.
60 const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
// NOTE(review): both UTF-16 arrays are declared with 4 elements but only the
// first 2 are initialized (the rest are zero); write_utf16_bom copies 2 bytes.
// Confirm that nothing passes these arrays to a size-deducing helper.
61 const unsigned char utf16_bom[4] = { 0xFE, 0xFF };
62 const unsigned char utf16le_bom[4] = { 0xFF, 0xFE };
// Copy the N-byte byte order mark bom to to.next.
// Only the copy itself is visible in this excerpt; write_utf8_bom forwards
// the result of this function as its own success flag.
66 write_bom(range<char>& to, const unsigned char (&bom)[N])
70 memcpy(to.next, bom, N);
75 // If generate_header is set in mode write out UTF-8 BOM.
// In that case the result of write_bom for the 3-byte utf8_bom is returned.
77 write_utf8_bom(range<char>& to, codecvt_mode mode)
79 if (mode & generate_header)
80 return write_bom(to, utf8_bom);
84 // If generate_header is set in mode write out the UTF-16 BOM indicated
85 // by whether little_endian is set in mode.
87 write_utf16_bom(range<char16_t>& to, codecvt_mode mode)
89 if (mode & generate_header)
// The BOMs are stored as raw bytes in the requested order, so the selected
// one is copied bytewise (2 bytes) instead of being assigned as a code unit.
93 auto* bom = (mode & little_endian) ? utf16le_bom : utf16_bom;
94 std::memcpy(to.next, bom, 2);
// Test whether the input starts with the N-byte byte order mark bom: at least
// N elements must remain and the first N bytes must compare equal.
// (What happens on a match is not visible in this excerpt.)
102 read_bom(range<const char>& from, const unsigned char (&bom)[N])
104 if (from.size() >= N && !memcmp(from.next, bom, N))
112 // If consume_header is set in mode update from.next to after any BOM.
// The result of read_bom is discarded, so a missing BOM is not an error.
114 read_utf8_bom(range<const char>& from, codecvt_mode mode)
116 if (mode & consume_header)
117 read_bom(from, utf8_bom);
120 // If consume_header is set in mode update from.next to after any BOM.
121 // Return little_endian iff the UTF-16LE BOM was present.
// Nothing is examined when the input is empty.
// NOTE(review): the BOM is compared as a natively loaded char16_t, so which
// byte sequence 0xFEFF / 0xFFFE stands for depends on the host byte order —
// confirm this matches the "UTF-16LE BOM" wording above on every target.
123 read_utf16_bom(range<const char16_t>& from, codecvt_mode mode)
125 if (mode & consume_header && from.size())
127 if (*from.next == 0xFEFF)
129 else if (*from.next == 0xFFFE)
132 return little_endian;
138 // Read a codepoint from a UTF-8 multibyte sequence.
139 // Updates from.next if the codepoint is not greater than maxcode.
140 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
// The lead byte c1 selects the sequence length; every continuation byte must
// have the bit pattern 10xxxxxx, i.e. (byte & 0xC0) == 0x80.
// NOTE(review): no check rejecting UTF-8-encoded surrogates (lead 0xED with
// second byte >= 0xA0) is visible in this excerpt — confirm whether intended.
142 read_utf8_code_point(range<const char>& from, unsigned long maxcode)
144 const size_t avail = from.size();
146 return incomplete_mb_character;
147 unsigned char c1 = from.next[0];
148 // https://en.wikipedia.org/wiki/UTF-8#Sample_code
154 else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
155 return invalid_mb_sequence;
156 else if (c1 < 0xE0) // 2-byte sequence
159 return incomplete_mb_character;
160 unsigned char c2 = from.next[1];
161 if ((c2 & 0xC0) != 0x80)
162 return invalid_mb_sequence;
// 0x3080 == (0xC0 << 6) + 0x80: removes the marker bits of both bytes at once.
163 char32_t c = (c1 << 6) + c2 - 0x3080;
168 else if (c1 < 0xF0) // 3-byte sequence
171 return incomplete_mb_character;
172 unsigned char c2 = from.next[1];
173 if ((c2 & 0xC0) != 0x80)
174 return invalid_mb_sequence;
175 if (c1 == 0xE0 && c2 < 0xA0) // overlong
176 return invalid_mb_sequence;
177 unsigned char c3 = from.next[2];
178 if ((c3 & 0xC0) != 0x80)
179 return invalid_mb_sequence;
// 0xE2080 == (0xE0 << 12) + (0x80 << 6) + 0x80.
180 char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
185 else if (c1 < 0xF5) // 4-byte sequence
188 return incomplete_mb_character;
189 unsigned char c2 = from.next[1];
190 if ((c2 & 0xC0) != 0x80)
191 return invalid_mb_sequence;
192 if (c1 == 0xF0 && c2 < 0x90) // overlong
193 return invalid_mb_sequence;
194 if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
195 return invalid_mb_sequence;
196 unsigned char c3 = from.next[2];
197 if ((c3 & 0xC0) != 0x80)
198 return invalid_mb_sequence;
199 unsigned char c4 = from.next[3];
200 if ((c4 & 0xC0) != 0x80)
201 return invalid_mb_sequence;
// 0x3C82080 == (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80.
202 char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
// Lead bytes 0xF5 and above never start a valid sequence.
208 return invalid_mb_sequence;
// Encode code_point as UTF-8 at to.next, advancing to.next past each byte
// written. The encoded length is chosen by range: 1 byte below 0x80, 2 bytes
// up to 0x7FF, 3 bytes up to 0xFFFF and 4 bytes up to 0x10FFFF.
// Callers treat the result as a success flag; the output-space checks and the
// handling of larger values are not visible in this excerpt.
212 write_utf8_code_point(range<char>& to, char32_t code_point)
214 if (code_point < 0x80)
218 *to.next++ = code_point;
220 else if (code_point <= 0x7FF)
// Lead byte 110xxxxx, then one continuation byte 10xxxxxx.
224 *to.next++ = (code_point >> 6) + 0xC0;
225 *to.next++ = (code_point & 0x3F) + 0x80;
227 else if (code_point <= 0xFFFF)
// Lead byte 1110xxxx, then two continuation bytes.
231 *to.next++ = (code_point >> 12) + 0xE0;
232 *to.next++ = ((code_point >> 6) & 0x3F) + 0x80;
233 *to.next++ = (code_point & 0x3F) + 0x80;
235 else if (code_point <= 0x10FFFF)
// Lead byte 11110xxx, then three continuation bytes.
239 *to.next++ = (code_point >> 18) + 0xF0;
240 *to.next++ = ((code_point >> 12) & 0x3F) + 0x80;
241 *to.next++ = ((code_point >> 6) & 0x3F) + 0x80;
242 *to.next++ = (code_point & 0x3F) + 0x80;
// Return c unchanged when the byte order requested by mode (little_endian set
// or clear) matches the host's, and byte-swapped otherwise.
250 adjust_byte_order(char16_t c, codecvt_mode mode)
252 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
// Big-endian host: swap only when little-endian output is requested.
253 return (mode & little_endian) ? __builtin_bswap16(c) : c;
// Other branch of the conditional (its #else/#endif lines are not shown).
255 return (mode & little_endian) ? c : __builtin_bswap16(c);
259 // Return true if c is a high-surrogate (aka leading) code point.
// That is, c lies in the inclusive range [0xD800, 0xDBFF].
261 is_high_surrogate(char32_t c)
263 return c >= 0xD800 && c <= 0xDBFF;
266 // Return true if c is a low-surrogate (aka trailing) code point.
// That is, c lies in the inclusive range [0xDC00, 0xDFFF].
268 is_low_surrogate(char32_t c)
270 return c >= 0xDC00 && c <= 0xDFFF;
// Combine a high and a low surrogate into the code point they encode.
// 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000, so a single subtraction
// strips both surrogate bases and adds the 0x10000 offset.
// The arguments are not validated here; callers check them first.
274 surrogate_pair_to_code_point(char32_t high, char32_t low)
276 return (high << 10) + low - 0x35FDC00;
279 // Read a codepoint from a UTF-16 multibyte sequence.
280 // The sequence's endianness is indicated by (mode & little_endian).
281 // Updates from.next if the codepoint is not greater than maxcode.
282 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
284 read_utf16_code_point(range<const char16_t>& from, unsigned long maxcode,
287 const size_t avail = from.size();
289 return incomplete_mb_character;
// Convert the first unit to host byte order before classifying it.
291 char32_t c = adjust_byte_order(from.next[0], mode);
292 if (is_high_surrogate(c))
// A high surrogate needs a second unit to be present ...
295 return incomplete_mb_character;
296 const char16_t c2 = adjust_byte_order(from.next[1], mode);
// ... and that unit must be a low surrogate.
297 if (is_low_surrogate(c2))
299 c = surrogate_pair_to_code_point(c, c2);
303 return invalid_mb_sequence;
// A low surrogate with no preceding high surrogate is invalid.
305 else if (is_low_surrogate(c))
306 return invalid_mb_sequence;
// Write codepoint to the output range as UTF-16 in the byte order selected by
// mode. A code point that fits in one 16-bit unit is written as that unit;
// anything larger is written as a lead/trail surrogate pair, which requires
// two free units in the output.
314 write_utf16_code_point(range<C>& to, char32_t codepoint, codecvt_mode mode)
316 static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
// The bound is inclusive: U+FFFF itself fits in a single unit, and the
// surrogate arithmetic below is only meaningful for code points >= 0x10000.
318 if (codepoint <= max_single_utf16_unit)
322 *to.next = adjust_byte_order(codepoint, mode);
327 else if (to.size() > 1)
329 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
330 const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
331 char16_t lead = LEAD_OFFSET + (codepoint >> 10);
332 char16_t trail = 0xDC00 + (codepoint & 0x3FF);
333 to.next[0] = adjust_byte_order(lead, mode);
334 to.next[1] = adjust_byte_order(trail, mode);
// Convert UTF-8 input to UCS-4 output.
// Skips a leading BOM when mode asks for it, then decodes one code point per
// output element while both input and output space remain.
// Returns partial for a truncated sequence or when input is left over, error
// for any result above maxcode (which includes invalid_mb_sequence), else ok.
343 ucs4_in(range<const char>& from, range<char32_t>& to,
344 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
346 read_utf8_bom(from, mode);
347 while (from.size() && to.size())
349 const char32_t codepoint = read_utf8_code_point(from, maxcode);
350 if (codepoint == incomplete_mb_character)
351 return codecvt_base::partial;
352 if (codepoint > maxcode)
353 return codecvt_base::error;
354 *to.next++ = codepoint;
// Input remaining here means the output range filled up first.
356 return from.size() ? codecvt_base::partial : codecvt_base::ok;
// Convert UCS-4 input to UTF-8 output.
// Returns partial when the BOM or an encoded code point could not be written,
// error for a rejected code point (the test itself is not shown), else ok.
361 ucs4_out(range<const char32_t>& from, range<char>& to,
362 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
364 if (!write_utf8_bom(to, mode))
365 return codecvt_base::partial;
368 const char32_t c = from.next[0];
370 return codecvt_base::error;
371 if (!write_utf8_code_point(to, c))
372 return codecvt_base::partial;
375 return codecvt_base::ok;
// Convert UTF-16 input to UCS-4 output.
// Returns partial for a truncated surrogate pair or when input is left over,
// error for any result above maxcode (which includes invalid_mb_sequence),
// else ok.
380 ucs4_in(range<const char16_t>& from, range<char32_t>& to,
381 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
// NOTE(review): "mode & little_endian" keeps at most the little_endian bit and
// clears every other flag; it does not set little_endian when the BOM reports
// it. "mode | little_endian" may have been intended — confirm together with
// the host-order comparison in read_utf16_bom.
383 if (read_utf16_bom(from, mode) == little_endian)
384 mode = codecvt_mode(mode & little_endian);
385 while (from.size() && to.size())
387 const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
388 if (codepoint == incomplete_mb_character)
389 return codecvt_base::partial;
390 if (codepoint > maxcode)
391 return codecvt_base::error;
392 *to.next++ = codepoint;
// Input remaining here means the output range filled up first.
394 return from.size() ? codecvt_base::partial : codecvt_base::ok;
// Convert UCS-4 input to UTF-16 output in the byte order selected by mode.
// Returns partial when the BOM or a code point could not be written, error
// for a rejected code point (the test itself is not shown), else ok.
399 ucs4_out(range<const char32_t>& from, range<char16_t>& to,
400 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
402 if (!write_utf16_bom(to, mode))
403 return codecvt_base::partial;
406 const char32_t c = from.next[0];
408 return codecvt_base::error;
409 if (!write_utf16_code_point(to, c, mode))
410 return codecvt_base::partial;
413 return codecvt_base::ok;
// Convert UTF-8 input to UTF-16 output whose units have type C.
// Returns partial for a truncated UTF-8 sequence or when a code point could
// not be written, error for any result above maxcode, else ok.
419 utf16_in(range<const char>& from, range<C>& to,
420 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
422 read_utf8_bom(from, mode);
423 while (from.size() && to.size())
// Start of the current character; presumably used to rewind the input when
// the output has no room (the rewind itself is not shown) — confirm.
425 const char* const first = from.next;
426 const char32_t codepoint = read_utf8_code_point(from, maxcode);
427 if (codepoint == incomplete_mb_character)
428 return codecvt_base::partial;
429 if (codepoint > maxcode)
430 return codecvt_base::error;
431 if (!write_utf16_code_point(to, codepoint, mode))
434 return codecvt_base::partial;
437 return codecvt_base::ok;
// Convert UTF-16 input (units of type C) to UTF-8 output.
// Surrogate pairs are combined into one code point before encoding; a high
// surrogate not followed by a low surrogate, or a lone low surrogate, is an
// error. Returns partial when the BOM or a code point could not be written.
443 utf16_out(range<const C>& from, range<char>& to,
444 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
446 if (!write_utf8_bom(to, mode))
447 return codecvt_base::partial;
450 char32_t c = from.next[0];
452 if (is_high_surrogate(c))
// NOTE(review): this path reports ok although input is left unconverted; the
// guarding condition is not visible here — confirm it is deliberate.
455 return codecvt_base::ok; // stop converting at this point
457 const char32_t c2 = from.next[1];
458 if (is_low_surrogate(c2))
460 c = surrogate_pair_to_code_point(c, c2);
464 return codecvt_base::error;
466 else if (is_low_surrogate(c))
467 return codecvt_base::error;
469 return codecvt_base::error;
470 if (!write_utf8_code_point(to, c))
471 return codecvt_base::partial;
474 return codecvt_base::ok;
477 // Return pos such that the UTF-8 input [begin,pos) converts to a valid
// UTF-16 string of no more than max code units.
// A code point above max_single_utf16_unit is counted as two units.
479 utf16_span(const char* begin, const char* end, size_t max,
480 char32_t maxcode = max_code_point, codecvt_mode mode = {})
482 range<const char> from{ begin, end };
483 read_utf8_bom(from, mode);
// Keep going while at least two units of budget remain, since the next
// character may need a surrogate pair.
485 while (count+1 < max)
487 char32_t c = read_utf8_code_point(from, maxcode);
490 else if (c > max_single_utf16_unit)
// Exactly one unit of budget left: the limit is the smaller of maxcode and
// 0xFFFF, so a character needing a surrogate pair is not consumed.
494 if (count+1 == max) // take one more character if it fits in a single unit
495 read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
// Convert UTF-8 input to UCS-2 output.
// UCS-2 has no surrogate pairs, so the effective limit is the smaller of the
// caller's maxcode and 0xFFFF; anything above it makes utf16_in report error.
501 ucs2_in(range<const char>& from, range<char16_t>& to,
502 char32_t maxcode = max_code_point, codecvt_mode mode = {})
504 return utf16_in(from, to, std::min(max_single_utf16_unit, maxcode), mode);
// Convert UCS-2 input to UTF-8 output.
// UCS-2 has no surrogate pairs, so the effective limit is the smaller of the
// caller's maxcode and 0xFFFF; anything above it makes utf16_out report error.
509 ucs2_out(range<const char16_t>& from, range<char>& to,
510 char32_t maxcode = max_code_point, codecvt_mode mode = {})
512 return utf16_out(from, to, std::min(max_single_utf16_unit, maxcode), mode);
// Copy UCS-2 input to UTF-16 output in the byte order selected by mode.
// A high surrogate in the input is an error. Returns partial when the BOM
// could not be written or input is left over, else ok.
517 ucs2_out(range<const char16_t>& from, range<char16_t>& to,
518 char32_t maxcode = max_code_point, codecvt_mode mode = {})
520 if (!write_utf16_bom(to, mode))
521 return codecvt_base::partial;
522 while (from.size() && to.size())
524 char16_t c = from.next[0];
525 if (is_high_surrogate(c))
526 return codecvt_base::error;
// Second rejection path; its condition is not visible in this excerpt.
528 return codecvt_base::error;
529 *to.next++ = adjust_byte_order(c, mode);
532 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
// Convert UTF-16 input to UCS-2 output.
// Returns partial for a truncated sequence or when input is left over, and
// error for a rejected code point (the test itself is not shown).
537 ucs2_in(range<const char16_t>& from, range<char16_t>& to,
538 char32_t maxcode = max_code_point, codecvt_mode mode = {})
// NOTE(review): "mode & little_endian" keeps at most the little_endian bit and
// clears every other flag; "mode | little_endian" may have been intended —
// confirm together with the host-order comparison in read_utf16_bom.
540 if (read_utf16_bom(from, mode) == little_endian)
541 mode = codecvt_mode(mode & little_endian);
// UCS-2 has no surrogate pairs: never accept more than 0xFFFF, and honour a
// smaller caller-supplied maxcode.
542 maxcode = std::min(max_single_utf16_unit, maxcode);
543 while (from.size() && to.size())
545 const char32_t c = read_utf16_code_point(from, maxcode, mode);
546 if (c == incomplete_mb_character)
547 return codecvt_base::partial;
549 return codecvt_base::error;
552 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
// Scan UTF-16 input [begin,end) for at most max characters representable in
// UCS-2, stopping at the first result above the effective maxcode.
556 ucs2_span(const char16_t* begin, const char16_t* end, size_t max,
557 char32_t maxcode, codecvt_mode mode)
559 range<const char16_t> from{ begin, end };
// NOTE(review): "mode & little_endian" keeps at most the little_endian bit and
// clears every other flag; "mode | little_endian" may have been intended —
// confirm together with the host-order comparison in read_utf16_bom.
560 if (read_utf16_bom(from, mode) == little_endian)
561 mode = codecvt_mode(mode & little_endian);
// UCS-2 has no surrogate pairs: never accept more than 0xFFFF, and honour a
// smaller caller-supplied maxcode.
562 maxcode = std::min(max_single_utf16_unit, maxcode);
564 while (max-- && c <= maxcode)
565 c = read_utf16_code_point(from, maxcode, mode);
// Scan UTF-8 input [begin,end) for at most max characters representable in
// UCS-2, stopping at the first result above the effective maxcode.
570 ucs2_span(const char* begin, const char* end, size_t max,
571 char32_t maxcode, codecvt_mode mode)
573 range<const char> from{ begin, end };
574 read_utf8_bom(from, mode);
// UCS-2 has no surrogate pairs: never accept more than 0xFFFF, and honour a
// smaller caller-supplied maxcode.
575 maxcode = std::min(max_single_utf16_unit, maxcode);
577 while (max-- && c <= maxcode)
578 c = read_utf8_code_point(from, maxcode);
582 // Return pos such that the UTF-8 input [begin,pos) is a valid encoding of a
// UCS-4 string no longer than max characters.
// Scanning also stops at the first result above maxcode, which includes the
// invalid/incomplete sentinels.
584 ucs4_span(const char* begin, const char* end, size_t max,
585 char32_t maxcode = max_code_point, codecvt_mode mode = {})
587 range<const char> from{ begin, end };
588 read_utf8_bom(from, mode);
590 while (max-- && c <= maxcode)
591 c = read_utf8_code_point(from, maxcode);
595 // Return pos such that the UTF-16 input [begin,pos) is a valid encoding of a
// UCS-4 string no longer than max characters.
// Scanning also stops at the first result above maxcode, which includes the
// invalid/incomplete sentinels.
597 ucs4_span(const char16_t* begin, const char16_t* end, size_t max,
598 char32_t maxcode = max_code_point, codecvt_mode mode = {})
600 range<const char16_t> from{ begin, end };
// NOTE(review): "mode & little_endian" keeps at most the little_endian bit and
// clears every other flag; "mode | little_endian" may have been intended —
// confirm together with the host-order comparison in read_utf16_bom.
601 if (read_utf16_bom(from, mode) == little_endian)
602 mode = codecvt_mode(mode & little_endian);
604 while (max-- && c <= maxcode)
605 c = read_utf16_code_point(from, maxcode, mode);
610 // Define members of codecvt<char16_t, char, mbstate_t> specialization.
611 // Converts from UTF-8 to UTF-16.
613 locale::id codecvt<char16_t, char, mbstate_t>::id;
615 codecvt<char16_t, char, mbstate_t>::~codecvt() { }
// Member taking intern_type input and extern_type output (presumably do_out;
// the line naming it is not shown). Converts UTF-16 to UTF-8 with utf16_out
// using its default maxcode and mode, then reports how far the input was
// consumed through __from_next.
618 codecvt<char16_t, char, mbstate_t>::
620 const intern_type* __from,
621 const intern_type* __from_end, const intern_type*& __from_next,
622 extern_type* __to, extern_type* __to_end,
623 extern_type*& __to_next) const
625 range<const char16_t> from{ __from, __from_end };
626 range<char> to{ __to, __to_end };
627 auto res = utf16_out(from, to);
628 __from_next = from.next;
// There is no shift state to terminate, so this reports noconv.
634 codecvt<char16_t, char, mbstate_t>::
635 do_unshift(state_type&, extern_type* __to, extern_type*,
636 extern_type*& __to_next) const
639 return noconv; // we don't use mbstate_t for the unicode facets
// Convert UTF-8 external input to UTF-16 internal output with utf16_in,
// accepting code points up to max_code_point.
643 codecvt<char16_t, char, mbstate_t>::
644 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
645 const extern_type*& __from_next,
646 intern_type* __to, intern_type* __to_end,
647 intern_type*& __to_next) const
649 range<const char> from{ __from, __from_end };
650 range<char16_t> to{ __to, __to_end };
// Pick the mode matching the host byte order so that adjust_byte_order leaves
// the internal code units unswapped.
651 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
652 codecvt_mode mode = {};
654 codecvt_mode mode = little_endian;
656 auto res = utf16_in(from, to, max_code_point, mode);
657 __from_next = from.next;
663 codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
667 codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
// Number of external chars that utf16_span accepts from [__from, __end) for
// at most __max UTF-16 code units, using its default maxcode and mode.
671 codecvt<char16_t, char, mbstate_t>::
672 do_length(state_type&, const extern_type* __from,
673 const extern_type* __end, size_t __max) const
675 __end = utf16_span(__from, __end, __max);
676 return __end - __from;
680 codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
682 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
683 // whereas 4 byte sequences require two 16-bit code units.
687 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
688 // Converts from UTF-8 to UTF-32 (aka UCS-4).
690 locale::id codecvt<char32_t, char, mbstate_t>::id;
692 codecvt<char32_t, char, mbstate_t>::~codecvt() { }
// Convert UCS-4 internal input to UTF-8 external output with ucs4_out, using
// its default maxcode and mode.
695 codecvt<char32_t, char, mbstate_t>::
696 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
697 const intern_type*& __from_next,
698 extern_type* __to, extern_type* __to_end,
699 extern_type*& __to_next) const
701 range<const char32_t> from{ __from, __from_end };
702 range<char> to{ __to, __to_end };
703 auto res = ucs4_out(from, to);
704 __from_next = from.next;
710 codecvt<char32_t, char, mbstate_t>::
711 do_unshift(state_type&, extern_type* __to, extern_type*,
712 extern_type*& __to_next) const
// Convert UTF-8 external input to UCS-4 internal output with ucs4_in, using
// its default maxcode and mode.
719 codecvt<char32_t, char, mbstate_t>::
720 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
721 const extern_type*& __from_next,
722 intern_type* __to, intern_type* __to_end,
723 intern_type*& __to_next) const
725 range<const char> from{ __from, __from_end };
726 range<char32_t> to{ __to, __to_end };
727 auto res = ucs4_in(from, to);
728 __from_next = from.next;
734 codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
738 codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
// Number of external chars that ucs4_span accepts from [__from, __end) for at
// most __max characters, using its default maxcode and mode.
742 codecvt<char32_t, char, mbstate_t>::
743 do_length(state_type&, const extern_type* __from,
744 const extern_type* __end, size_t __max) const
746 __end = ucs4_span(__from, __end, __max);
747 return __end - __from;
751 codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
754 // Define members of codecvt_utf8<char16_t> base class implementation.
755 // Converts from UTF-8 to UCS-2.
757 __codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
// Convert UCS-2 internal input to UTF-8 external output with ucs2_out, using
// the facet's _M_maxcode and _M_mode.
760 __codecvt_utf8_base<char16_t>::
761 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
762 const intern_type*& __from_next,
763 extern_type* __to, extern_type* __to_end,
764 extern_type*& __to_next) const
766 range<const char16_t> from{ __from, __from_end };
767 range<char> to{ __to, __to_end };
768 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
769 __from_next = from.next;
775 __codecvt_utf8_base<char16_t>::
776 do_unshift(state_type&, extern_type* __to, extern_type*,
777 extern_type*& __to_next) const
// Convert UTF-8 external input to UCS-2 internal output with ucs2_in, using
// the facet's _M_maxcode.
784 __codecvt_utf8_base<char16_t>::
785 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
786 const extern_type*& __from_next,
787 intern_type* __to, intern_type* __to_end,
788 intern_type*& __to_next) const
790 range<const char> from{ __from, __from_end };
791 range<char16_t> to{ __to, __to_end };
// Keep only the header flags of _M_mode; the byte order of the internal
// units follows the host, not the little_endian bit of _M_mode.
792 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
793 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
794 mode = codecvt_mode(mode | little_endian);
796 auto res = ucs2_in(from, to, _M_maxcode, mode);
797 __from_next = from.next;
803 __codecvt_utf8_base<char16_t>::do_encoding() const throw()
807 __codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
// Number of external chars that ucs2_span accepts from [__from, __end) for at
// most __max characters, using the facet's _M_maxcode and _M_mode.
811 __codecvt_utf8_base<char16_t>::
812 do_length(state_type&, const extern_type* __from,
813 const extern_type* __end, size_t __max) const
815 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
816 return __end - __from;
820 __codecvt_utf8_base<char16_t>::do_max_length() const throw()
823 // Define members of codecvt_utf8<char32_t> base class implementation.
824 // Converts from UTF-8 to UTF-32 (aka UCS-4).
826 __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
// Convert UCS-4 internal input to UTF-8 external output with ucs4_out, using
// the facet's _M_maxcode and _M_mode.
829 __codecvt_utf8_base<char32_t>::
830 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
831 const intern_type*& __from_next,
832 extern_type* __to, extern_type* __to_end,
833 extern_type*& __to_next) const
835 range<const char32_t> from{ __from, __from_end };
836 range<char> to{ __to, __to_end };
837 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
838 __from_next = from.next;
844 __codecvt_utf8_base<char32_t>::
845 do_unshift(state_type&, extern_type* __to, extern_type*,
846 extern_type*& __to_next) const
// Convert UTF-8 external input to UCS-4 internal output with ucs4_in, using
// the facet's _M_maxcode and _M_mode.
853 __codecvt_utf8_base<char32_t>::
854 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
855 const extern_type*& __from_next,
856 intern_type* __to, intern_type* __to_end,
857 intern_type*& __to_next) const
859 range<const char> from{ __from, __from_end };
860 range<char32_t> to{ __to, __to_end };
861 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
862 __from_next = from.next;
868 __codecvt_utf8_base<char32_t>::do_encoding() const throw()
872 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
// Number of external chars that ucs4_span accepts from [__from, __end) for at
// most __max characters, using the facet's _M_maxcode and _M_mode.
876 __codecvt_utf8_base<char32_t>::
877 do_length(state_type&, const extern_type* __from,
878 const extern_type* __end, size_t __max) const
880 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
881 return __end - __from;
885 __codecvt_utf8_base<char32_t>::do_max_length() const throw()
888 #ifdef _GLIBCXX_USE_WCHAR_T
889 // Define members of codecvt_utf8<wchar_t> base class implementation.
890 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
892 __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
// Convert wchar_t internal input to UTF-8 external output.
// The input is reinterpreted as char16_t (UCS-2) or char32_t (UCS-4)
// according to __SIZEOF_WCHAR_T__ and handed to ucs2_out or ucs4_out.
895 __codecvt_utf8_base<wchar_t>::
896 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
897 const intern_type*& __from_next,
898 extern_type* __to, extern_type* __to_end,
899 extern_type*& __to_next) const
901 range<char> to{ __to, __to_end };
902 #if __SIZEOF_WCHAR_T__ == 2
903 range<const char16_t> from{
904 reinterpret_cast<const char16_t*>(__from),
905 reinterpret_cast<const char16_t*>(__from_end)
907 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
908 #elif __SIZEOF_WCHAR_T__ == 4
909 range<const char32_t> from{
910 reinterpret_cast<const char32_t*>(__from),
911 reinterpret_cast<const char32_t*>(__from_end)
913 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
// Presumably the fallback for any other wchar_t size (its #else is not shown).
915 return codecvt_base::error;
917 __from_next = reinterpret_cast<const wchar_t*>(from.next);
923 __codecvt_utf8_base<wchar_t>::
924 do_unshift(state_type&, extern_type* __to, extern_type*,
925 extern_type*& __to_next) const
// Convert UTF-8 external input to wchar_t internal output.
// The output is reinterpreted as char16_t (UCS-2) or char32_t (UCS-4)
// according to __SIZEOF_WCHAR_T__ and filled by ucs2_in or ucs4_in.
932 __codecvt_utf8_base<wchar_t>::
933 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
934 const extern_type*& __from_next,
935 intern_type* __to, intern_type* __to_end,
936 intern_type*& __to_next) const
938 range<const char> from{ __from, __from_end };
939 #if __SIZEOF_WCHAR_T__ == 2
941 reinterpret_cast<char16_t*>(__to),
942 reinterpret_cast<char16_t*>(__to_end)
944 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
945 #elif __SIZEOF_WCHAR_T__ == 4
947 reinterpret_cast<char32_t*>(__to),
948 reinterpret_cast<char32_t*>(__to_end)
950 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
// Presumably the fallback for any other wchar_t size (its #else is not shown).
952 return codecvt_base::error;
954 __from_next = from.next;
955 __to_next = reinterpret_cast<wchar_t*>(to.next);
960 __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
964 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
// Number of external chars accepted from [__from, __end) for at most __max
// characters, measured with ucs2_span or ucs4_span according to the size of
// wchar_t.
968 __codecvt_utf8_base<wchar_t>::
969 do_length(state_type&, const extern_type* __from,
970 const extern_type* __end, size_t __max) const
972 #if __SIZEOF_WCHAR_T__ == 2
973 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
974 #elif __SIZEOF_WCHAR_T__ == 4
975 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
979 return __end - __from;
983 __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
987 // Define members of codecvt_utf16<char16_t> base class implementation.
988 // Converts from UTF-16 to UCS-2.
990 __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
// Convert UCS-2 internal input to UTF-16 external output with ucs2_out.
// The external char buffer is viewed as char16_t units for the conversion and
// the resulting cursor is converted back to char* for __to_next.
993 __codecvt_utf16_base<char16_t>::
994 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
995 const intern_type*& __from_next,
996 extern_type* __to, extern_type* __to_end,
997 extern_type*& __to_next) const
999 range<const char16_t> from{ __from, __from_end };
1001 reinterpret_cast<char16_t*>(__to),
1002 reinterpret_cast<char16_t*>(__to_end)
1004 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1005 __from_next = from.next;
1006 __to_next = reinterpret_cast<char*>(to.next);
1010 codecvt_base::result
1011 __codecvt_utf16_base<char16_t>::
1012 do_unshift(state_type&, extern_type* __to, extern_type*,
1013 extern_type*& __to_next) const
// Convert UTF-16 external input to UCS-2 internal output with ucs2_in.
// The external char buffer is viewed as char16_t units for the conversion and
// the resulting cursor is converted back to const char* for __from_next.
1019 codecvt_base::result
1020 __codecvt_utf16_base<char16_t>::
1021 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1022 const extern_type*& __from_next,
1023 intern_type* __to, intern_type* __to_end,
1024 intern_type*& __to_next) const
1026 range<const char16_t> from{
1027 reinterpret_cast<const char16_t*>(__from),
1028 reinterpret_cast<const char16_t*>(__from_end)
1030 range<char16_t> to{ __to, __to_end };
1031 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1032 __from_next = reinterpret_cast<const char*>(from.next);
1033 __to_next = to.next;
1038 __codecvt_utf16_base<char16_t>::do_encoding() const throw()
1042 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
// Number of external chars accepted from [__from, __end) for at most __max
// characters. The buffer is scanned as char16_t units by ucs2_span and the
// result is converted back to a distance in chars.
1046 __codecvt_utf16_base<char16_t>::
1047 do_length(state_type&, const extern_type* __from,
1048 const extern_type* __end, size_t __max) const
1050 auto next = reinterpret_cast<const char16_t*>(__from);
1051 next = ucs2_span(next, reinterpret_cast<const char16_t*>(__end), __max,
1052 _M_maxcode, _M_mode);
1053 return reinterpret_cast<const char*>(next) - __from;
1057 __codecvt_utf16_base<char16_t>::do_max_length() const throw()
1060 // Define members of codecvt_utf16<char32_t> base class implementation.
1061 // Converts from UTF-16 to UTF-32 (aka UCS-4).
1063 __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
// Convert UCS-4 internal input to UTF-16 external output with ucs4_out.
// The external char buffer is viewed as char16_t units for the conversion and
// the resulting cursor is converted back to char* for __to_next.
1065 codecvt_base::result
1066 __codecvt_utf16_base<char32_t>::
1067 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1068 const intern_type*& __from_next,
1069 extern_type* __to, extern_type* __to_end,
1070 extern_type*& __to_next) const
1072 range<const char32_t> from{ __from, __from_end };
1074 reinterpret_cast<char16_t*>(__to),
1075 reinterpret_cast<char16_t*>(__to_end)
1077 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1078 __from_next = from.next;
1079 __to_next = reinterpret_cast<char*>(to.next);
1083 codecvt_base::result
1084 __codecvt_utf16_base<char32_t>::
1085 do_unshift(state_type&, extern_type* __to, extern_type*,
1086 extern_type*& __to_next) const
// Convert UTF-16 external input to UCS-4 internal output with ucs4_in.
// The external char buffer is viewed as char16_t units for the conversion and
// the resulting cursor is converted back to const char* for __from_next.
1092 codecvt_base::result
1093 __codecvt_utf16_base<char32_t>::
1094 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1095 const extern_type*& __from_next,
1096 intern_type* __to, intern_type* __to_end,
1097 intern_type*& __to_next) const
1099 range<const char16_t> from{
1100 reinterpret_cast<const char16_t*>(__from),
1101 reinterpret_cast<const char16_t*>(__from_end)
1103 range<char32_t> to{ __to, __to_end };
1104 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1105 __from_next = reinterpret_cast<const char*>(from.next);
1106 __to_next = to.next;
1111 __codecvt_utf16_base<char32_t>::do_encoding() const throw()
1115 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
// Number of external chars accepted from [__from, __end) for at most __max
// characters. The buffer is scanned as char16_t units by ucs4_span and the
// result is converted back to a distance in chars.
1119 __codecvt_utf16_base<char32_t>::
1120 do_length(state_type&, const extern_type* __from,
1121 const extern_type* __end, size_t __max) const
1123 auto next = reinterpret_cast<const char16_t*>(__from);
1124 next = ucs4_span(next, reinterpret_cast<const char16_t*>(__end), __max,
1125 _M_maxcode, _M_mode);
1126 return reinterpret_cast<const char*>(next) - __from;
1130 __codecvt_utf16_base<char32_t>::do_max_length() const throw()
1133 #ifdef _GLIBCXX_USE_WCHAR_T
1134 // Define members of codecvt_utf16<wchar_t> base class implementation.
1135 // Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1137 __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
// Convert wchar_t internal input to external output. The input is
// reinterpreted as char16_t or char32_t according to __SIZEOF_WCHAR_T__.
// NOTE(review): the output range is range<char>, so these calls resolve to
// the ucs2_out / ucs4_out overloads that write to a char range (the UTF-8
// writers above), whereas the char16_t and char32_t variants of this facet
// view the buffer as char16_t. Confirm whether UTF-16 output was intended.
1139 codecvt_base::result
1140 __codecvt_utf16_base<wchar_t>::
1141 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1142 const intern_type*& __from_next,
1143 extern_type* __to, extern_type* __to_end,
1144 extern_type*& __to_next) const
1146 range<char> to{ __to, __to_end };
1147 #if __SIZEOF_WCHAR_T__ == 2
1148 range<const char16_t> from{
1149 reinterpret_cast<const char16_t*>(__from),
1150 reinterpret_cast<const char16_t*>(__from_end)
1152 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1153 #elif __SIZEOF_WCHAR_T__ == 4
1154 range<const char32_t> from{
1155 reinterpret_cast<const char32_t*>(__from),
1156 reinterpret_cast<const char32_t*>(__from_end)
1158 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
// Presumably the fallback for any other wchar_t size (its #else is not shown).
1160 return codecvt_base::error;
1162 __from_next = reinterpret_cast<const wchar_t*>(from.next);
1163 __to_next = to.next;
1167 codecvt_base::result
1168 __codecvt_utf16_base<wchar_t>::
1169 do_unshift(state_type&, extern_type* __to, extern_type*,
1170 extern_type*& __to_next) const
// Convert external input to wchar_t internal output. The output is
// reinterpreted as char16_t or char32_t according to __SIZEOF_WCHAR_T__.
// NOTE(review): the input range is range<const char>, so these calls resolve
// to the ucs2_in / ucs4_in overloads that read from a char range (the UTF-8
// readers above), whereas do_length of this same facet scans the buffer as
// char16_t. Confirm whether UTF-16 input was intended.
1176 codecvt_base::result
1177 __codecvt_utf16_base<wchar_t>::
1178 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1179 const extern_type*& __from_next,
1180 intern_type* __to, intern_type* __to_end,
1181 intern_type*& __to_next) const
1183 range<const char> from{ __from, __from_end };
1184 #if __SIZEOF_WCHAR_T__ == 2
1186 reinterpret_cast<char16_t*>(__to),
1187 reinterpret_cast<char16_t*>(__to_end)
1189 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1190 #elif __SIZEOF_WCHAR_T__ == 4
1192 reinterpret_cast<char32_t*>(__to),
1193 reinterpret_cast<char32_t*>(__to_end)
1195 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
// Presumably the fallback for any other wchar_t size (its #else is not shown).
1197 return codecvt_base::error;
1199 __from_next = from.next;
1200 __to_next = reinterpret_cast<wchar_t*>(to.next);
1205 __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
1209 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
// Number of external chars accepted from [__from, __end) for at most __max
// characters. The buffer is scanned as char16_t units by ucs2_span or
// ucs4_span (chosen by the size of wchar_t) and the result is converted back
// to a distance in chars.
1213 __codecvt_utf16_base<wchar_t>::
1214 do_length(state_type&, const extern_type* __from,
1215 const extern_type* __end, size_t __max) const
1217 auto next = reinterpret_cast<const char16_t*>(__from);
1218 #if __SIZEOF_WCHAR_T__ == 2
1219 next = ucs2_span(next, reinterpret_cast<const char16_t*>(__end), __max,
1220 _M_maxcode, _M_mode);
1221 #elif __SIZEOF_WCHAR_T__ == 4
1222 next = ucs4_span(next, reinterpret_cast<const char16_t*>(__end), __max,
1223 _M_maxcode, _M_mode);
1225 return reinterpret_cast<const char*>(next) - __from;
1229 __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
1233 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1234 // Converts from UTF-8 to UTF-16.
1236 __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
// Convert UTF-16 internal input to UTF-8 external output with utf16_out,
// using the facet's _M_maxcode and _M_mode.
1238 codecvt_base::result
1239 __codecvt_utf8_utf16_base<char16_t>::
1240 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1241 const intern_type*& __from_next,
1242 extern_type* __to, extern_type* __to_end,
1243 extern_type*& __to_next) const
1245 range<const char16_t> from{ __from, __from_end };
1246 range<char> to{ __to, __to_end };
1247 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1248 __from_next = from.next;
1249 __to_next = to.next;
1253 codecvt_base::result
1254 __codecvt_utf8_utf16_base<char16_t>::
1255 do_unshift(state_type&, extern_type* __to, extern_type*,
1256 extern_type*& __to_next) const
// Convert UTF-8 external input to UTF-16 internal output with utf16_in, using
// the facet's _M_maxcode.
1262 codecvt_base::result
1263 __codecvt_utf8_utf16_base<char16_t>::
1264 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1265 const extern_type*& __from_next,
1266 intern_type* __to, intern_type* __to_end,
1267 intern_type*& __to_next) const
1269 range<const char> from{ __from, __from_end };
1270 range<char16_t> to{ __to, __to_end };
// Keep only the header flags of _M_mode; the byte order of the internal
// units follows the host, not the little_endian bit of _M_mode.
1271 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1272 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1273 mode = codecvt_mode(mode | little_endian);
1275 auto res = utf16_in(from, to, _M_maxcode, mode);
1276 __from_next = from.next;
1277 __to_next = to.next;
// do_encoding() for the char16_t specialization; const and declared throw()
// (non-throwing).
1282 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
// do_always_noconv() for the char16_t specialization; const and declared
// throw() (non-throwing).
1286 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
// do_length() for the char16_t specialization.  utf16_span() is given the
// external range [__from, __end), the __max limit and this facet's
// _M_maxcode and _M_mode; the pointer it returns replaces __end, and the
// function returns the distance from __from to that pointer.  The state
// argument is unnamed and therefore unused.
1290 __codecvt_utf8_utf16_base<char16_t>::
1291 do_length(state_type&, const extern_type* __from,
1292 const extern_type* __end, size_t __max) const
1294 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1295 return __end - __from;
// do_max_length() for the char16_t specialization; const and declared
// throw() (non-throwing).  The note below gives the reasoning behind the
// value it reports.
1299 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1301 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
1302 // whereas 4 byte sequences require two 16-bit code units.
1306 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1307 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor for the char32_t specialization; the body is empty.
1309 __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
// Convert the internal char32_t sequence [__from, __from_end) into the
// external char buffer [__to, __to_end) by delegating to utf16_out() with
// this facet's _M_maxcode and _M_mode.  The state argument is unnamed and
// therefore unused.  The conversion status from utf16_out() is kept in res.
1311 codecvt_base::result
1312 __codecvt_utf8_utf16_base<char32_t>::
1313 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1314 const intern_type*& __from_next,
1315 extern_type* __to, extern_type* __to_end,
1316 extern_type*& __to_next) const
1318 range<const char32_t> from{ __from, __from_end };
1319 range<char> to{ __to, __to_end };
1320 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
// Report to the caller how far the input and output ranges were advanced.
1321 __from_next = from.next;
1322 __to_next = to.next;
// do_unshift() for the char32_t specialization.  The state and the
// end-of-output pointer are unnamed parameters, so only __to and __to_next
// are available to the body.
1326 codecvt_base::result
1327 __codecvt_utf8_utf16_base<char32_t>::
1328 do_unshift(state_type&, extern_type* __to, extern_type*,
1329 extern_type*& __to_next) const
// Convert the external char sequence [__from, __from_end) into the internal
// char32_t buffer [__to, __to_end) by delegating to utf16_in().  The state
// argument is unnamed and therefore unused.
//
// NOTE(review): the char16_t specialization of do_in() masks _M_mode down to
// consume_header|generate_header and sets little_endian from the target byte
// order before calling utf16_in(); this specialization forwards _M_mode
// unchanged.  Confirm that the difference is intentional.
1335 codecvt_base::result
1336 __codecvt_utf8_utf16_base<char32_t>::
1337 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1338 const extern_type*& __from_next,
1339 intern_type* __to, intern_type* __to_end,
1340 intern_type*& __to_next) const
1342 range<const char> from{ __from, __from_end };
1343 range<char32_t> to{ __to, __to_end };
1344 auto res = utf16_in(from, to, _M_maxcode, _M_mode);
// Report to the caller how far the input and output ranges were advanced.
1345 __from_next = from.next;
1346 __to_next = to.next;
// do_encoding() for the char32_t specialization; const and declared throw()
// (non-throwing).
1351 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
// do_always_noconv() for the char32_t specialization; const and declared
// throw() (non-throwing).
1355 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
// do_length() for the char32_t specialization.  utf16_span() is given the
// external range [__from, __end), the __max limit and this facet's
// _M_maxcode and _M_mode; the pointer it returns replaces __end, and the
// function returns the distance from __from to that pointer.  The state
// argument is unnamed and therefore unused.
1359 __codecvt_utf8_utf16_base<char32_t>::
1360 do_length(state_type&, const extern_type* __from,
1361 const extern_type* __end, size_t __max) const
1363 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1364 return __end - __from;
// do_max_length() for the char32_t specialization; const and declared
// throw() (non-throwing).  The note below gives the reasoning behind the
// value it reports.
1368 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1370 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
1371 // whereas 4 byte sequences require two 16-bit code units.
1375 #ifdef _GLIBCXX_USE_WCHAR_T
1376 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1377 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor for the wchar_t specialization; the body is empty.
1379 __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
// Convert the internal wchar_t sequence [__from, __from_end) into the
// external char buffer [__to, __to_end) by delegating to utf16_out() with
// this facet's _M_maxcode and _M_mode.  The state argument is unnamed and
// therefore unused.  The conversion status from utf16_out() is kept in res.
1381 codecvt_base::result
1382 __codecvt_utf8_utf16_base<wchar_t>::
1383 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1384 const intern_type*& __from_next,
1385 extern_type* __to, extern_type* __to_end,
1386 extern_type*& __to_next) const
1388 range<const wchar_t> from{ __from, __from_end };
1389 range<char> to{ __to, __to_end };
1390 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
// Report to the caller how far the input and output ranges were advanced.
1391 __from_next = from.next;
1392 __to_next = to.next;
// do_unshift() for the wchar_t specialization.  The state and the
// end-of-output pointer are unnamed parameters, so only __to and __to_next
// are available to the body.
1396 codecvt_base::result
1397 __codecvt_utf8_utf16_base<wchar_t>::
1398 do_unshift(state_type&, extern_type* __to, extern_type*,
1399 extern_type*& __to_next) const
// Convert the external char sequence [__from, __from_end) into the internal
// wchar_t buffer [__to, __to_end) by delegating to utf16_in().  The state
// argument is unnamed and therefore unused.
//
// NOTE(review): the char16_t specialization of do_in() masks _M_mode down to
// consume_header|generate_header and sets little_endian from the target byte
// order before calling utf16_in(); this specialization forwards _M_mode
// unchanged.  Confirm that the difference is intentional.
1405 codecvt_base::result
1406 __codecvt_utf8_utf16_base<wchar_t>::
1407 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1408 const extern_type*& __from_next,
1409 intern_type* __to, intern_type* __to_end,
1410 intern_type*& __to_next) const
1412 range<const char> from{ __from, __from_end };
1413 range<wchar_t> to{ __to, __to_end };
1414 auto res = utf16_in(from, to, _M_maxcode, _M_mode);
// Report to the caller how far the input and output ranges were advanced.
1415 __from_next = from.next;
1416 __to_next = to.next;
// do_encoding() for the wchar_t specialization; const and declared throw()
// (non-throwing).
1421 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
// do_always_noconv() for the wchar_t specialization; const and declared
// throw() (non-throwing).
1425 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
// do_length() for the wchar_t specialization.  utf16_span() is given the
// external range [__from, __end), the __max limit and this facet's
// _M_maxcode and _M_mode; the pointer it returns replaces __end, and the
// function returns the distance from __from to that pointer.  The state
// argument is unnamed and therefore unused.
1429 __codecvt_utf8_utf16_base<wchar_t>::
1430 do_length(state_type&, const extern_type* __from,
1431 const extern_type* __end, size_t __max) const
1433 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1434 return __end - __from;
// do_max_length() for the wchar_t specialization; const and declared
// throw() (non-throwing).  The note below gives the reasoning behind the
// value it reports.
1438 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1440 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
1441 // whereas 4 byte sequences require two 16-bit code units.
// Explicit instantiations for the char16_t and char32_t codecvt support
// classes with external type char and state type mbstate_t.
// NOTE: `inline template class ...` is not ISO C++.  GCC documents it as an
// extension that instantiates only the compiler support data for the class
// (e.g. its vtable) and not its member functions; do not "fix" it by
// removing the `inline` keyword without checking which symbols are exported.
1446 inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
1447 inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
// Full explicit instantiations of codecvt_byname for the same two types.
1448 template class codecvt_byname<char16_t, char, mbstate_t>;
1449 template class codecvt_byname<char32_t, char, mbstate_t>;
1451 _GLIBCXX_END_NAMESPACE_VERSION
1453 #endif // _GLIBCXX_USE_C99_STDINT_TR1