lib/libc/locale/utf8.c

   1 /*
   2  * Copyright 2015 Matthew Dillon <dillon@backplane.com> (mbintowcr, wcrtombin)
   3  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
   4  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
   5  * Copyright (c) 2002-2004 Tim J. Robbins
   6  * All rights reserved.
   7  *
   8  * Copyright (c) 2011 The FreeBSD Foundation
   9  * All rights reserved.
  10  * Portions of this software were developed by David Chisnall
  11  * under sponsorship from the FreeBSD Foundation.
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in the
  20  *    documentation and/or other materials provided with the distribution.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34
/*
 * Flags accepted by the binary-safe conversion routines below:
 *
 * WCSBIN_EOF -         Indicate EOF on input buffer.
 *
 * WCSBIN_SURRO -       Pass-through surrogate space (typically if the UTF-8
 *                      has already been escaped), on bytes-to-wchars and
 *                      wchars-to-bytes.  Escaping of other illegal codes will
 *                      still occur on input but de-escaping will not occur
 *                      on output (they will remain in the surrogate space).
 *
 * WCSBIN_LONGCODES -   Allow 4-byte sequences > 0x10FFFF as well as 5-byte
 *                      and 6-byte sequences (normally illegal); otherwise
 *                      escape them on input and fail on output.
 *
 * WCSBIN_STRICT -      Allow byte-to-wide conversions to fail.
 */
  50
  51 #include <sys/param.h>
  52
  53 #include <errno.h>
  54 #include <limits.h>
  55 #include <runetype.h>
  56 #include <stdlib.h>
  57 #include <string.h>
  58 #include <wchar.h>
  59 #include "mblocal.h"
  60
  61 extern int __mb_sb_limit;
  62
  63 static size_t   _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
  64                     size_t, mbstate_t * __restrict);
  65 static int      _UTF8_mbsinit(const mbstate_t *);
  66 static size_t   _UTF8_mbsnrtowcs(wchar_t * __restrict,
  67                     const char ** __restrict, size_t, size_t,
  68                     mbstate_t * __restrict);
  69 static size_t   _UTF8_wcrtomb(char * __restrict, wchar_t,
  70                     mbstate_t * __restrict);
  71 static size_t   _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
  72                     size_t, size_t, mbstate_t * __restrict);
  73 static size_t   _UTF8_mbintowcr(wchar_t * __restrict dst,
  74                     const char * __restrict src,
  75                     size_t dlen, size_t *slen, int flags);
  76 static size_t   _UTF8_wcrtombin(char * __restrict dst,
  77                     const wchar_t * __restrict src,
  78                     size_t dlen, size_t *slen, int flags);
  79
  80 typedef struct {
  81         wchar_t ch;
  82         int     want;
  83         wchar_t lbound;
  84 } _UTF8State;
  85
  86 int
  87 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
  88 {
  89
  90         l->__mbrtowc = _UTF8_mbrtowc;
  91         l->__wcrtomb = _UTF8_wcrtomb;
  92         l->__mbsinit = _UTF8_mbsinit;
  93         l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
  94         l->__wcsnrtombs = _UTF8_wcsnrtombs;
  95         l->__mbintowcr = _UTF8_mbintowcr;
  96         l->__wcrtombin = _UTF8_wcrtombin;
  97         l->runes = rl;
  98         l->__mb_cur_max = 4;
  99         /*
 100          * UCS-4 encoding used as the internal representation, so
 101          * slots 0x0080-0x00FF are occuped and must be excluded
 102          * from the single byte ctype by setting the limit.
 103          */
 104         l->__mb_sb_limit = 128;
 105
 106         return (0);
 107 }
 108
 109 static int
 110 _UTF8_mbsinit(const mbstate_t *ps)
 111 {
 112
 113         return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
 114 }
 115
 116 static size_t
 117 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
 118     mbstate_t * __restrict ps)
 119 {
 120         _UTF8State *us;
 121         int ch, i, mask, want;
 122         wchar_t lbound, wch;
 123
 124         us = (_UTF8State *)ps;
 125
 126         if (us->want < 0 || us->want > 4) {
 127                 errno = EINVAL;
 128                 return ((size_t)-1);
 129         }
 130
 131         if (s == NULL) {
 132                 s = "";
 133                 n = 1;
 134                 pwc = NULL;
 135         }
 136
 137         if (n == 0)
 138                 /* Incomplete multibyte sequence */
 139                 return ((size_t)-2);
 140
 141         if (us->want == 0) {
 142                 /*
 143                  * Determine the number of octets that make up this character
 144                  * from the first octet, and a mask that extracts the
 145                  * interesting bits of the first octet. We already know
 146                  * the character is at least two bytes long.
 147                  *
 148                  * We also specify a lower bound for the character code to
 149                  * detect redundant, non-"shortest form" encodings. For
 150                  * example, the sequence C0 80 is _not_ a legal representation
 151                  * of the null character. This enforces a 1-to-1 mapping
 152                  * between character codes and their multibyte representations.
 153                  */
 154                 ch = (unsigned char)*s;
 155                 if ((ch & 0x80) == 0) {
 156                         /* Fast path for plain ASCII characters. */
 157                         if (pwc != NULL)
 158                                 *pwc = ch;
 159                         return (ch != '\0' ? 1 : 0);
 160                 }
 161                 if ((ch & 0xe0) == 0xc0) {
 162                         mask = 0x1f;
 163                         want = 2;
 164                         lbound = 0x80;
 165                 } else if ((ch & 0xf0) == 0xe0) {
 166                         mask = 0x0f;
 167                         want = 3;
 168                         lbound = 0x800;
 169                 } else if ((ch & 0xf8) == 0xf0) {
 170                         mask = 0x07;
 171                         want = 4;
 172                         lbound = 0x10000;
 173                 } else {
 174                         /*
 175                          * Malformed input; input is not UTF-8.
 176                          */
 177                         errno = EILSEQ;
 178                         return ((size_t)-1);
 179                 }
 180         } else {
 181                 want = us->want;
 182                 lbound = us->lbound;
 183         }
 184
 185         /*
 186          * Decode the octet sequence representing the character in chunks
 187          * of 6 bits, most significant first.
 188          */
 189         if (us->want == 0)
 190                 wch = (unsigned char)*s++ & mask;
 191         else
 192                 wch = us->ch;
 193
 194         for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
 195                 if ((*s & 0xc0) != 0x80) {
 196                         /*
 197                          * Malformed input; bad characters in the middle
 198                          * of a character.
 199                          */
 200                         errno = EILSEQ;
 201                         return ((size_t)-1);
 202                 }
 203                 wch <<= 6;
 204                 wch |= *s++ & 0x3f;
 205         }
 206         if (i < want) {
 207                 /* Incomplete multibyte sequence. */
 208                 us->want = want - i;
 209                 us->lbound = lbound;
 210                 us->ch = wch;
 211                 return ((size_t)-2);
 212         }
 213         if (wch < lbound || (wch & ~0x10ffff)) {
 214                 /*
 215                  * Malformed input; redundant encoding or illegal
 216                  *                  code sequence.
 217                  */
 218                 errno = EILSEQ;
 219                 return ((size_t)-1);
 220         }
 221         if (pwc != NULL)
 222                 *pwc = wch;
 223         us->want = 0;
 224         return (wch == L'\0' ? 0 : want);
 225 }
 226
 227 static size_t
 228 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
 229     size_t nms, size_t len, mbstate_t * __restrict ps)
 230 {
 231         _UTF8State *us;
 232         const char *s;
 233         size_t nchr;
 234         wchar_t wc;
 235         size_t nb;
 236
 237         us = (_UTF8State *)ps;
 238
 239         s = *src;
 240         nchr = 0;
 241
 242         if (dst == NULL) {
 243                 /*
 244                  * The fast path in the loop below is not safe if an ASCII
 245                  * character appears as anything but the first byte of a
 246                  * multibyte sequence. Check now to avoid doing it in the loop.
 247                  */
 248                 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
 249                         errno = EILSEQ;
 250                         return ((size_t)-1);
 251                 }
 252                 for (;;) {
 253                         if (nms > 0 && (signed char)*s > 0)
 254                                 /*
 255                                  * Fast path for plain ASCII characters
 256                                  * excluding NUL.
 257                                  */
 258                                 nb = 1;
 259                         else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
 260                             (size_t)-1)
 261                                 /* Invalid sequence - mbrtowc() sets errno. */
 262                                 return ((size_t)-1);
 263                         else if (nb == 0 || nb == (size_t)-2)
 264                                 return (nchr);
 265                         s += nb;
 266                         nms -= nb;
 267                         nchr++;
 268                 }
 269                 /*NOTREACHED*/
 270         }
 271
 272         /*
 273          * The fast path in the loop below is not safe if an ASCII
 274          * character appears as anything but the first byte of a
 275          * multibyte sequence. Check now to avoid doing it in the loop.
 276          */
 277         if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
 278                 errno = EILSEQ;
 279                 return ((size_t)-1);
 280         }
 281         while (len-- > 0) {
 282                 if (nms > 0 && (signed char)*s > 0) {
 283                         /*
 284                          * Fast path for plain ASCII characters
 285                          * excluding NUL.
 286                          */
 287                         *dst = (wchar_t)*s;
 288                         nb = 1;
 289                 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
 290                     (size_t)-1) {
 291                         *src = s;
 292                         return ((size_t)-1);
 293                 } else if (nb == (size_t)-2) {
 294                         *src = s + nms;
 295                         return (nchr);
 296                 } else if (nb == 0) {
 297                         *src = NULL;
 298                         return (nchr);
 299                 }
 300                 s += nb;
 301                 nms -= nb;
 302                 nchr++;
 303                 dst++;
 304         }
 305         *src = s;
 306         return (nchr);
 307 }
 308
 309 static size_t
 310 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
 311 {
 312         _UTF8State *us;
 313         unsigned char lead;
 314         int i, len;
 315
 316         us = (_UTF8State *)ps;
 317
 318         if (us->want != 0) {
 319                 errno = EINVAL;
 320                 return ((size_t)-1);
 321         }
 322
 323         if (s == NULL)
 324                 /* Reset to initial shift state (no-op) */
 325                 return (1);
 326
 327         /*
 328          * Determine the number of octets needed to represent this character.
 329          * We always output the shortest sequence possible. Also specify the
 330          * first few bits of the first octet, which contains the information
 331          * about the sequence length.
 332          */
 333         if ((wc & ~0x7f) == 0) {
 334                 /* Fast path for plain ASCII characters. */
 335                 *s = (char)wc;
 336                 return (1);
 337         } else if ((wc & ~0x7ff) == 0) {
 338                 lead = 0xc0;
 339                 len = 2;
 340         } else if ((wc & ~0xffff) == 0) {
 341                 lead = 0xe0;
 342                 len = 3;
 343         } else if ((wc & ~0x10ffff) == 0) {
 344                 lead = 0xf0;
 345                 len = 4;
 346         } else {
 347                 errno = EILSEQ;
 348                 return ((size_t)-1);
 349         }
 350
 351         /*
 352          * Output the octets representing the character in chunks
 353          * of 6 bits, least significant last. The first octet is
 354          * a special case because it contains the sequence length
 355          * information.
 356          */
 357         for (i = len - 1; i > 0; i--) {
 358                 s[i] = (wc & 0x3f) | 0x80;
 359                 wc >>= 6;
 360         }
 361         *s = (wc & 0xff) | lead;
 362
 363         return (len);
 364 }
 365
 366 static size_t
 367 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
 368     size_t nwc, size_t len, mbstate_t * __restrict ps)
 369 {
 370         _UTF8State *us;
 371         char buf[MB_LEN_MAX];
 372         const wchar_t *s;
 373         size_t nbytes;
 374         size_t nb;
 375
 376         us = (_UTF8State *)ps;
 377
 378         if (us->want != 0) {
 379                 errno = EINVAL;
 380                 return ((size_t)-1);
 381         }
 382
 383         s = *src;
 384         nbytes = 0;
 385
 386         if (dst == NULL) {
 387                 while (nwc-- > 0) {
 388                         if (0 <= *s && *s < 0x80)
 389                                 /* Fast path for plain ASCII characters. */
 390                                 nb = 1;
 391                         else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
 392                             (size_t)-1)
 393                                 /* Invalid character - wcrtomb() sets errno. */
 394                                 return ((size_t)-1);
 395                         if (*s == L'\0')
 396                                 return (nbytes + nb - 1);
 397                         s++;
 398                         nbytes += nb;
 399                 }
 400                 return (nbytes);
 401         }
 402
 403         while (len > 0 && nwc-- > 0) {
 404                 if (0 <= *s && *s < 0x80) {
 405                         /* Fast path for plain ASCII characters. */
 406                         nb = 1;
 407                         *dst = *s;
 408                 } else if (len > (size_t)MB_CUR_MAX) {
 409                         /* Enough space to translate in-place. */
 410                         if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
 411                                 *src = s;
 412                                 return ((size_t)-1);
 413                         }
 414                 } else {
 415                         /*
 416                          * May not be enough space; use temp. buffer.
 417                          */
 418                         if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
 419                                 *src = s;
 420                                 return ((size_t)-1);
 421                         }
 422                         if (nb > (int)len)
 423                                 /* MB sequence for character won't fit. */
 424                                 break;
 425                         (void) memcpy(dst, buf, nb);
 426                 }
 427                 if (*s == L'\0') {
 428                         *src = NULL;
 429                         return (nbytes + nb - 1);
 430                 }
 431                 s++;
 432                 dst += nb;
 433                 len -= nb;
 434                 nbytes += nb;
 435         }
 436         *src = s;
 437         return (nbytes);
 438 }
 439
 440 /*
 441  * Clean binary to wchar buffer conversions.  This is basically like a normal
 442  * buffer conversion but with a sane argument API and escaping.  See none.c
 443  * for a more complete description.
 444  */
 445 static size_t
 446 _UTF8_mbintowcr(wchar_t * __restrict dst, const char * __restrict src,
 447                 size_t dlen, size_t *slen, int flags)
 448 {
 449         size_t i;
 450         size_t j;
 451         size_t k;
 452         size_t n = *slen;
 453         int ch, mask, want;
 454         wchar_t lbound, wch;
 455
 456         for (i = j = 0; i < n; ++i) {
 457                 if (j == dlen)
 458                         break;
 459                 ch = (unsigned char)src[i];
 460
 461                 if ((ch & 0x80) == 0) {
 462                         /* Fast path for plain ASCII characters. */
 463                         if (dst)
 464                                 dst[j] = ch;
 465                         ++j;
 466                         continue;
 467                 }
 468                 if ((ch & 0xe0) == 0xc0) {
 469                         mask = 0x1f;
 470                         want = 2;
 471                         lbound = 0x80;
 472                 } else if ((ch & 0xf0) == 0xe0) {
 473                         mask = 0x0f;
 474                         want = 3;
 475                         lbound = 0x800;
 476                 } else if ((ch & 0xf8) == 0xf0) {
 477                         mask = 0x07;
 478                         want = 4;
 479                         lbound = 0x10000;
 480                 } else if ((ch & 0xfc) == 0xf8) {
 481                         /* normally illegal, handled down below */
 482                         mask = 0x03;
 483                         want = 5;
 484                         lbound = 0x200000;
 485                 } else if ((ch & 0xfe) == 0xfc) {
 486                         /* normally illegal, handled down below */
 487                         mask = 0x01;
 488                         want = 6;
 489                         lbound = 0x4000000;
 490                 } else {
 491                         /*
 492                          * Malformed input; input is not UTF-8, escape
 493                          * with UTF-8B.
 494                          */
 495                         if (flags & WCSBIN_STRICT) {
 496                                 if (i == 0) {
 497                                         errno = EILSEQ;
 498                                         return ((size_t)-1);
 499                                 }
 500                                 break;
 501                         }
 502                         if (dst)
 503                                 dst[j] = 0xDC00 | ch;
 504                         ++j;
 505                         continue;
 506                 }
 507
 508                 /*
 509                  * Construct wchar_t from multibyte sequence.
 510                  */
 511                 wch = ch & mask;
 512                 for (k = 1; k < want; ++k) {
 513                         /*
 514                          * Stop if not enough input (don't do this early
 515                          * so we can detect illegal characters as they occur
 516                          * in the stream).
 517                          *
 518                          * If termination is requested force-escape all chars.
 519                          */
 520                         if (i + k >= n) {
 521                                 if (flags & WCSBIN_EOF) {
 522                                         want = n - i;
 523                                         goto forceesc;
 524                                 }
 525                                 goto breakout;
 526                         }
 527
 528                         ch = src[i+k];
 529                         if ((ch & 0xc0) != 0x80) {
 530                                 /*
 531                                  * Malformed input, bad characters in the
 532                                  * middle of a multibyte sequence.  Escape
 533                                  * with UTF-8B.
 534                                  */
 535                                 if (flags & WCSBIN_STRICT) {
 536                                         if (i == 0) {
 537                                                 errno = EILSEQ;
 538                                                 return ((size_t)-1);
 539                                         }
 540                                         goto breakout;
 541                                 }
 542                                 if (dst)
 543                                         dst[j] = 0xDC00 | (unsigned char)src[i];
 544                                 ++j;
 545                                 goto loopup;
 546                         }
 547                         wch <<= 6;
 548                         wch |= ch & 0x3f;
 549                 }
 550
 551                 /*
 552                  * Check validity of the wchar.  If invalid we could escape
 553                  * just the first character and loop up, but it ought to be
 554                  * more readable if we escape all the chars in the sequence
 555                  * (since they are all >= 0x80 and might represent a legacy
 556                  * 5-byte or 6-byte code).
 557                  */
 558                 if (wch < lbound ||
 559                     ((flags & WCSBIN_LONGCODES) == 0 && (wch & ~0x10ffff)) ||
 560                     ((flags & WCSBIN_LONGCODES) == 0 && want >= 5)) {
 561                         goto forceesc;
 562                 }
 563
 564                 /*
 565                  * Check if wch is a surrogate code (which also encloses our
 566                  * UTF-8B escaping range).  This is normally illegal in UTF8.
 567                  * If it is, we need to escape each characer in the sequence.
 568                  * Breakout if there isn't enough output buffer space.
 569                  *
 570                  * If (flags & WCSBIN_SURRO) the caller wishes to accept
 571                  * surrogate codes, i.e. the input might potentially already
 572                  * be escaped UTF8-B or unchecked UTF-16 that was converted
 573                  * into UTF-8.
 574                  */
 575                 if ((flags & WCSBIN_SURRO) == 0 &&
 576                     wch >= 0xD800 && wch <= 0xDFFF) {
 577 forceesc:
 578                         if (j + want > dlen)
 579                                 break;
 580                         if (flags & WCSBIN_STRICT) {
 581                                 if (i == 0) {
 582                                         errno = EILSEQ;
 583                                         return ((size_t)-1);
 584                                 }
 585                                 break;
 586                         }
 587                         for (k = 0; k < want; ++k) {
 588                                 if (dst) {
 589                                         dst[j] = 0xDC00 |
 590                                                  (unsigned char)src[i+k];
 591                                 }
 592                                 ++j;
 593                         }
 594                         i += k - 1;
 595                 } else {
 596                         i += k - 1;
 597                         if (dst)
 598                                 dst[j] = wch;
 599                         ++j;
 600                 }
 601 loopup:
 602                 ;
 603         }
 604 breakout:
 605         *slen = i;
 606
 607         return j;
 608 }
 609
 610 static size_t
 611 _UTF8_wcrtombin(char * __restrict dst, const wchar_t * __restrict src,
 612                 size_t dlen, size_t *slen, int flags)
 613 {
 614         size_t i;
 615         size_t j;
 616         size_t k;
 617         size_t n = *slen;
 618         size_t len;
 619         unsigned char lead;
 620         wchar_t wc;
 621
 622         for (i = j = 0; i < n; ++i) {
 623                 if (j == dlen)
 624                         break;
 625                 wc = src[i];
 626
 627                 if ((wc & ~0x7f) == 0) {
 628                         /* Fast path for plain ASCII characters. */
 629                         if (dst)
 630                                 dst[j] = (unsigned char)wc;
 631                         ++j;
 632                         continue;
 633                 }
 634                 if ((wc & ~0x7ff) == 0) {
 635                         lead = 0xc0;
 636                         len = 2;
 637                 } else if (wc >= 0xDC80 && wc <= 0xDCFF &&
 638                            (flags & WCSBIN_SURRO) == 0) {
 639                         if (flags & WCSBIN_STRICT) {
 640                                 /*
 641                                  * STRICT without SURRO is an error for
 642                                  * surrogates.
 643                                  */
 644                                 if (i == 0) {
 645                                         errno = EILSEQ;
 646                                         return ((size_t)-1);
 647                                 }
 648                                 break;
 649                         }
 650                         if (dst)
 651                                 dst[j] = (unsigned char)wc;
 652                         ++j;
 653                         continue;
 654                 } else if ((wc & ~0xffff) == 0) {
 655                         if (wc >= 0xD800 && wc <= 0xDFFF &&
 656                             (flags & (WCSBIN_SURRO | WCSBIN_STRICT)) ==
 657                             WCSBIN_STRICT) {
 658                                 /*
 659                                  * Surrogates in general are an error
 660                                  * if STRICT is specified and SURRO is not
 661                                  * specified.
 662                                  */
 663                                 if (i == 0) {
 664                                         errno = EILSEQ;
 665                                         return ((size_t)-1);
 666                                 }
 667                                 break;
 668                         }
 669                         lead = 0xe0;
 670                         len = 3;
 671                 } else if ((wc & ~0x10ffff) == 0) {
 672                         lead = 0xf0;
 673                         len = 4;
 674                 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x200000) {
 675                         /* normally illegal */
 676                         lead = 0xf0;
 677                         len = 4;
 678                 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x4000000) {
 679                         /* normally illegal */
 680                         lead = 0xf8;
 681                         len = 5;
 682                 } else if ((flags & WCSBIN_LONGCODES) &&
 683                            (uint32_t)wc < 0x80000000U) {
 684                         /* normally illegal */
 685                         lead = 0xfc;
 686                         len = 6;
 687                 } else {
 688                         if (i == 0) {
 689                                 errno = EILSEQ;
 690                                 return ((size_t)-1);
 691                         }
 692                         /* stop here, process error on next loop */
 693                         break;
 694                 }
 695
 696                 /*
 697                  * Output the octets representing the character in chunks
 698                  * of 6 bits, least significant last. The first octet is
 699                  * a special case because it contains the sequence length
 700                  * information.
 701                  */
 702                 if (j + len > dlen)
 703                         break;
 704                 k = j;
 705                 j += len;
 706                 if (dst) {
 707                         while (--len > 0) {
 708                                 dst[k + len] = (wc & 0x3f) | 0x80;
 709                                 wc >>= 6;
 710                         }
 711                         dst[k] = (wc & 0xff) | lead;
 712                 }
 713         }
 714         *slen = i;
 715
 716         return j;
 717 }
 718
 719 size_t
 720 utf8towcr(wchar_t * __restrict dst, const char * __restrict src,
 721                 size_t dlen, size_t *slen, int flags)
 722 {
 723         return _UTF8_mbintowcr(dst, src, dlen, slen, flags);
 724 }
 725
 726 size_t
 727 wcrtoutf8(char * __restrict dst, const wchar_t * __restrict src,
 728           size_t dlen, size_t *slen, int flags)
 729 {
 730         return _UTF8_wcrtombin(dst, src, dlen, slen, flags);
 731 }