2 * Copyright (c) 2003, 2005 Ryuichiro Imura
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * $FreeBSD: head/sys/libkern/iconv_ucs.c 267291 2014-06-09 19:27:47Z jhb $
29 #include <sys/param.h>
30 #include <sys/kernel.h>
31 #include <sys/systm.h>
32 #include <sys/malloc.h>
33 #include <sys/iconv.h>
35 #include "iconv_converter_if.h"
/*
 * Conversion-type bit flags.  iconv_ucs_open() ORs these into
 * dp->convtype; iconv_ucs_conv() and iconv_ucs_close() test them.
 */
/* When set, iconv_ucs_open() may open helper converters (f_ctp / t_ctp). */
41 #define KICONV_UCS_COMBINE 0x1
/* Source / destination is "UTF-8" (see unicode_family[]). */
42 #define KICONV_UCS_FROM_UTF8 0x2
43 #define KICONV_UCS_TO_UTF8 0x4
/* Source / destination is a little-endian form ("UCS-2LE", "UTF-16LE"). */
44 #define KICONV_UCS_FROM_LE 0x8
45 #define KICONV_UCS_TO_LE 0x10
/* Source / destination is a UTF-16 form ("UTF-16BE", "UTF-16LE"). */
46 #define KICONV_UCS_FROM_UTF16 0x20
47 #define KICONV_UCS_TO_UTF16 0x40
/*
 * Set by iconv_ucs_open() when ENCODING_UNICODE compares equal to
 * ENCODING_UTF16; iconv_ucs_conv() tests it before encoding or decoding
 * surrogate pairs.
 */
48 #define KICONV_UCS_UCS4 0x80
/* Encoding names used by this converter when calling strcmp()/iconv_open()/iconv_add(). */
50 #define ENCODING_UTF16 "UTF-16BE"
51 #define ENCODING_UTF8 "UTF-8"
/*
 * Fragment of the unicode_family[] table (its opening lines and any
 * terminating row are not part of this excerpt).  Each visible row pairs an
 * encoding name with the flag to set when that name is the conversion
 * source (from_flag) and the flag to set when it is the destination
 * (to_flag).  iconv_ucs_open() walks the table until it reaches an entry
 * whose name is null and matches names with strcasecmp().
 */
55 int from_flag, to_flag;
56 } unicode_family[] = {
57 { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 },
58 { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE },
59 { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 },
/* UTF-16LE carries both the UTF-16 flag and the little-endian flag. */
60 { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
61 KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
/*
 * Forward declarations of the file-local helpers used by iconv_ucs_conv():
 * UTF-8 decode/encode of a single code point and UTF-16 surrogate-pair
 * packing/unpacking.  iconv_ucs_conv() treats a NULL result from
 * ucs4_to_utf8() as a failure.
 */
65 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
66 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
67 static uint32_t encode_surrogate(uint32_t code);
68 static uint32_t decode_surrogate(const u_char *ucs);
/*
 * Declare that the iconv_ucs module depends on libiconv.  The three numeric
 * arguments are all 2 (presumably minimum / preferred / maximum version --
 * confirm against MODULE_DEPEND(9)).
 */
71 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
75 * UCS converter instance
/*
 * Charset-pair records referenced by a converter instance (the rest of the
 * structure is not part of this excerpt).  iconv_ucs_close() decrements
 * cp_refcount through both of these pointers.
 */
80 struct iconv_cspair * d_csp;
81 struct iconv_cspair * d_cspf;
/*
 * Create a UCS converter instance and work out its conversion type.
 *
 * Steps visible in this excerpt:
 *  - allocate the instance with kobj_create(..., M_ICONV, M_WAITOK);
 *  - take the source encoding name from cspf->cp_from when cspf is
 *    non-NULL, otherwise from csp->cp_from;
 *  - OR into dp->convtype the from/to flags of every unicode_family[] entry
 *    whose name matches `from` / `to` (case-insensitive);
 *  - set KICONV_UCS_UCS4 when ENCODING_UNICODE equals ENCODING_UTF16 (the
 *    clearing statement that follows is presumably the else branch);
 *  - with KICONV_UCS_COMBINE set, open helper converters into dp->f_ctp /
 *    dp->t_ctp for a side that is neither UTF-8 nor little-endian;
 *  - open dp->ctype when either side is UTF-8.
 *
 * NOTE(review): declarations, the assignment of `to`, the condition guarding
 * the KICONV_UCS_COMBINE assignment, closing braces and the return statement
 * are missing from this excerpt.
 * NOTE(review): none of the visible iconv_open() calls checks its return
 * value -- confirm that the callers tolerate NULL handles.
 */
88 iconv_ucs_open(struct iconv_converter_class *dcp,
89 struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
93 const char *from, *to;
95 dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
97 from = cspf ? cspf->cp_from : csp->cp_from;
102 dp->convtype |= KICONV_UCS_COMBINE;
/* Accumulate the flags for the source and destination encoding names. */
103 for (i = 0; unicode_family[i].name; i++) {
104 if (strcasecmp(from, unicode_family[i].name) == 0)
105 dp->convtype |= unicode_family[i].from_flag;
106 if (strcasecmp(to, unicode_family[i].name) == 0)
107 dp->convtype |= unicode_family[i].to_flag;
/* Both operands are compile-time string macros, so this test is constant. */
109 if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
110 dp->convtype |= KICONV_UCS_UCS4;
112 dp->convtype &= ~KICONV_UCS_UCS4;
/* Helper converter handles start out empty. */
114 dp->f_ctp = dp->t_ctp = NULL;
115 if (dp->convtype & KICONV_UCS_COMBINE) {
116 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
117 (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
118 iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
120 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
121 (dp->convtype & KICONV_UCS_TO_LE) == 0) {
122 iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
/* dp->ctype is what iconv_ucs_conv() passes to towlower()/towupper(). */
127 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
128 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
/* NOTE(review): the bodies of the next two tests are not visible here. */
131 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
138 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
/*
 * Tear down a converter instance: close the f_ctp, t_ctp and ctype handles,
 * decrement cp_refcount on the charset-pair records, and release the object
 * with kobj_delete(..., M_ICONV).
 *
 * NOTE(review): the conditions guarding the iconv_close() calls and the
 * first refcount decrement are missing from this excerpt.  The flag tests on
 * the remaining decrements use the same masks as the tests near the end of
 * iconv_ucs_open(), so they presumably undo references taken there --
 * confirm against the full file.
 */
145 iconv_ucs_close(void *data)
147 struct iconv_ucs *dp = data;
150 iconv_close(dp->f_ctp);
152 iconv_close(dp->t_ctp);
154 iconv_close(dp->ctype);
156 dp->d_cspf->cp_refcount--;
157 else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
158 dp->d_csp->cp_refcount--;
159 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
160 dp->d_csp->cp_refcount--;
161 kobj_delete((struct kobj*)data, M_ICONV);
/*
 * Convert text from *inbuf to *outbuf through an intermediate
 * ENCODING_UNICODE byte buffer (ucs[]).  The loop runs while both input and
 * output bytes remain.
 *
 * Each pass has two halves:
 *   1. source -> ucs[]: decode UTF-8 with utf8_to_ucs4(), or run the helper
 *      converter dp->f_ctp, or (per the existing comment) treat the source
 *      as already being a subset of ENCODING_UNICODE;
 *   2. ucs[] -> destination: encode UTF-8 with ucs4_to_utf8(), or run the
 *      helper converter dp->t_ctp, or (per the existing comment) treat the
 *      destination as a subset of ENCODING_UNICODE.
 *
 * casetype selects optional case mapping through dp->ctype:
 * KICONV_FROM_LOWER / KICONV_FROM_UPPER on the input side and
 * KICONV_LOWER / KICONV_UPPER on the output side.
 *
 * At the end, *inbytesleft and *outbytesleft are reduced by (in - ir) and
 * (on - or) respectively.
 *
 * NOTE(review): many lines of this function (declarations, error returns,
 * pointer and counter updates, closing braces) are missing from this
 * excerpt, so the return values and the use of convchar cannot be
 * documented from here.
 */
166 iconv_ucs_conv(void *d2p, const char **inbuf,
167 size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
168 int convchar, int casetype)
170 struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
172 size_t in, on, ir, or, inlen, outlen, ucslen;
/* Guard against missing buffers (the action taken is not visible here). */
178 if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
/* in/on keep the initial byte counts; ir/or are the loop's working counts. */
180 ir = in = *inbytesleft;
181 or = on = *outbytesleft;
185 while (ir > 0 && or > 0) {
188 * The first half of conversion.
189 * (convert any code into ENCODING_UNICODE)
193 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
194 /* convert UTF-8 to ENCODING_UNICODE */
196 code = utf8_to_ucs4(p, &inlen, ir);
/* Optional case mapping of the decoded code point. */
202 if (casetype == KICONV_FROM_LOWER && dp->ctype) {
203 code = towlower(code, dp->ctype);
204 } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
205 code = towupper(code, dp->ctype);
/* Reject the surrogate range U+D800..U+DFFF and values above U+10FFFF. */
208 if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
209 /* reserved for utf-16 surrogate pair */
210 /* invalid unicode */
216 if (dp->convtype & KICONV_UCS_UCS4) {
218 code = encode_surrogate(code);
220 /* can't handle with ucs-2 */
228 /* save UCS-4 into ucs[] */
/*
 * Stores the ucslen low-order bytes of code, most significant first.
 * NOTE(review): the i >= 0 test needs i to be signed; its declaration is
 * not part of this excerpt.
 */
229 for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
230 *q++ = (code >> (i << 3)) & 0xff;
232 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
233 /* convert local code to ENCODING_UNICODE */
237 ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
238 &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
245 /* src code is a proper subset of ENCODING_UNICODE */
/* NOTE(review): body not visible; presumably byte-order handling -- confirm. */
247 if (dp->convtype & KICONV_UCS_FROM_LE) {
/* First byte 0xD8..0xDB: a UTF-16 high (leading) surrogate. */
255 if ((*q & 0xfc) == 0xd8) {
256 if (dp->convtype & KICONV_UCS_UCS4 &&
257 dp->convtype & KICONV_UCS_FROM_UTF16) {
260 /* invalid unicode */
273 if (dp->convtype & KICONV_UCS_FROM_LE) {
/* The following unit must start with 0xDC..0xDF (low surrogate). */
280 if ((*q & 0xfc) != 0xdc) {
281 /* invalid unicode */
289 * The second half of conversion.
290 * (convert ENCODING_UNICODE into any code)
293 if (dp->convtype & KICONV_UCS_TO_UTF8) {
295 if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
296 /* decode surrogate pair */
297 code = decode_surrogate(p);
/* Otherwise the code point is the big-endian 16-bit value in ucs[0..1]. */
299 code = (ucs[0] << 8) | ucs[1];
302 if (casetype == KICONV_LOWER && dp->ctype) {
303 code = towlower(code, dp->ctype);
304 } else if (casetype == KICONV_UPPER && dp->ctype) {
305 code = towupper(code, dp->ctype);
/* A NULL result means the code point could not be written as UTF-8. */
309 if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
319 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
320 ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
321 &or, casetype & (KICONV_LOWER | KICONV_UPPER));
329 /* dst code is a proper subset of ENCODING_UNICODE */
/* NOTE(review): the bodies of the remaining flag tests are not visible. */
337 if (dp->convtype & KICONV_UCS_TO_LE) {
346 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
347 (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
351 if (dp->convtype & KICONV_UCS_TO_LE) {
/* Report progress to the caller: subtract consumed / produced byte counts. */
367 *inbytesleft -= in - ir;
368 *outbytesleft -= on - or;
/*
 * Converter-class init hook (registered as iconv_converter_init in
 * iconv_ucs_methods).  The visible lines call iconv_add() twice, once with
 * (ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8) and once with
 * (ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE), storing each result
 * in `error`.
 *
 * NOTE(review): the meaning of the iconv_add() arguments (presumably
 * converter name, then the two encodings of the pair) and the handling of
 * `error` are not visible in this excerpt -- confirm against sys/iconv.h.
 */
373 iconv_ucs_init(struct iconv_converter_class *dcp)
377 error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
380 error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
/*
 * Converter-class done hook (registered as iconv_converter_done in
 * iconv_ucs_methods).  NOTE(review): the body is not part of this excerpt.
 */
387 iconv_ucs_done(struct iconv_converter_class *dcp)
/*
 * Converter-class name hook (registered as iconv_converter_name in
 * iconv_ucs_methods): returns the ENCODING_UNICODE string.  dcp is not used
 * in the visible line.
 */
393 iconv_ucs_name(struct iconv_converter_class *dcp)
395 return (ENCODING_UNICODE);
/*
 * kobj method table for the UCS converter: binds each iconv_converter_*
 * interface method to the iconv_ucs_* function implementing it.
 * NOTE(review): any terminating entry and the closing brace are not part of
 * this excerpt.
 */
398 static kobj_method_t iconv_ucs_methods[] = {
399 KOBJMETHOD(iconv_converter_open, iconv_ucs_open),
400 KOBJMETHOD(iconv_converter_close, iconv_ucs_close),
401 KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv),
402 KOBJMETHOD(iconv_converter_init, iconv_ucs_init),
403 KOBJMETHOD(iconv_converter_done, iconv_ucs_done),
404 KOBJMETHOD(iconv_converter_name, iconv_ucs_name),
/*
 * Instantiate the "ucs" converter class with an instance size of
 * sizeof(struct iconv_ucs).  NOTE(review): presumably this macro ties the
 * class to iconv_ucs_methods -- confirm against its definition in
 * sys/iconv.h.
 */
408 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
/*
 * Decode one UTF-8 sequence starting at src into a UCS-4 code point.
 *
 * The lead byte selects the sequence form (0xxxxxxx, 110xxxxx, 1110xxxx or
 * 11110xxx).  Every following byte must have the form 10xxxxxx, and its low
 * six bits are merged into the result.
 *
 * NOTE(review): the statements that set the width, report it through
 * utf8width, compare it with srclen and produce the error result are missing
 * from this excerpt.  No check for overlong encodings is visible either --
 * confirm against the full file.
 */
411 utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
417 * get leading 1 byte from utf-8
419 if ((*src & 0x80) == 0) {
421 * leading 1 bit is "0"
423 * ucs-4: 00000000 00000000 00000000 0xxxxxxx
426 /* get trailing 7 bits */
428 } else if ((*src & 0xe0) == 0xc0) {
430 * leading 3 bits are "110"
431 * utf-8: 110xxxxx 10yyyyyy
432 * ucs-4: 00000000 00000000 00000xxx xxyyyyyy
435 /* get trailing 5 bits */
437 } else if ((*src & 0xf0) == 0xe0) {
439 * leading 4 bits are "1110"
440 * utf-8: 1110xxxx 10yyyyyy 10zzzzzz
441 * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
444 /* get trailing 4 bits */
446 } else if ((*src & 0xf8) == 0xf0) {
448 * leading 5 bits are "11110"
449 * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
450 * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
453 /* get trailing 3 bits */
456 /* out of utf-16 range or having illegal bits */
464 * get left parts from utf-8
466 for (i = 1 ; i < w ; i++) {
467 if ((*(src + i) & 0xc0) != 0x80) {
468 /* invalid: leading 2 bits are not "10" */
471 /* concatenate trailing 6 bits into ucs4 */
473 ucs4 |= *(src + i) & 0x3f;
/*
 * Encode the code point ucs4 as UTF-8 into dst.
 *
 * The magnitude of ucs4 selects the lead-byte marker: 0xc0 below 0x800,
 * 0xe0 below 0x10000, 0xf0 below 0x200000.  The continuation bytes are then
 * written from the last position back to index 1, each holding the low six
 * bits of ucs4 with 0x80 ORed in.
 *
 * iconv_ucs_conv() treats a NULL return as failure.
 *
 * NOTE(review): the single-byte case, the width assignments, the dstlen
 * check, the shift of ucs4 between bytes, the lead-byte store and the return
 * statements are missing from this excerpt.
 */
481 ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
487 * determine utf-8 width and leading bits
492 } else if (ucs4 < 0x800) {
494 lead = 0xc0; /* "11" */
495 } else if (ucs4 < 0x10000) {
497 lead = 0xe0; /* "111" */
498 } else if (ucs4 < 0x200000) {
500 lead = 0xf0; /* "1111" */
512 for (i = w - 1 ; i >= 1 ; i--) {
513 /* get trailing 6 bits and put it with leading bit as "1" */
514 *(p + i) = (ucs4 & 0x3f) | 0x80;
/*
 * Pack a code point into a UTF-16 surrogate pair held in one 32-bit value.
 * After subtracting 0x10000, the upper ten bits go into bits 16..25 and the
 * lower ten bits into bits 0..9; ORing with 0xd800dc00 adds the high and low
 * surrogate markers.  The caller is expected to pass a value of at least
 * 0x10000 (the subtraction would wrap otherwise).
 */
525 encode_surrogate(register uint32_t code)
527 return ((((code - 0x10000) << 6) & 0x3ff0000) |
528 ((code - 0x10000) & 0x3ff) | 0xd800dc00);
/*
 * Rebuild a code point from four bytes holding a big-endian UTF-16
 * surrogate pair.  The low two bits of ucs[0] plus ucs[1] supply the upper
 * ten bits, the low two bits of ucs[2] plus ucs[3] supply the lower ten
 * bits, and 0x10000 is added to the combined 20-bit value.  The surrogate
 * marker bits are masked off, not validated.
 */
532 decode_surrogate(register const u_char *ucs)
534 return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) |
535 ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000);