1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2003, 2005 Ryuichiro Imura
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD: releng/12.0/sys/libkern/iconv_ucs.c 326271 2017-11-27 15:20:12Z pfg $");
31
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/systm.h>
35 #include <sys/malloc.h>
36 #include <sys/iconv.h>
37
38 #include "iconv_converter_if.h"
39
40 /*
41 * "UCS" converter
42 */
43
/*
 * Bit flags kept in iconv_ucs.convtype.  The FROM_/TO_ pairs are looked up
 * by charset name through unicode_family[]; COMBINE and UCS4 are set
 * directly by iconv_ucs_open().
 */
#define	KICONV_UCS_COMBINE	(1 << 0)	/* a second cspair (cspf) was supplied */
#define	KICONV_UCS_FROM_UTF8	(1 << 1)	/* source is UTF-8 */
#define	KICONV_UCS_TO_UTF8	(1 << 2)	/* target is UTF-8 */
#define	KICONV_UCS_FROM_LE	(1 << 3)	/* source code units are little-endian */
#define	KICONV_UCS_TO_LE	(1 << 4)	/* target code units are little-endian */
#define	KICONV_UCS_FROM_UTF16	(1 << 5)	/* source may carry surrogate pairs */
#define	KICONV_UCS_TO_UTF16	(1 << 6)	/* target may carry surrogate pairs */
#define	KICONV_UCS_UCS4		(1 << 7)	/* 4-byte (surrogate pair) characters are handled */

/* Charset names this converter compares against and opens helpers with. */
#define	ENCODING_UTF16		"UTF-16BE"
#define	ENCODING_UTF8		"UTF-8"
55
56 static struct {
57 const char *name;
58 int from_flag, to_flag;
59 } unicode_family[] = {
60 { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 },
61 { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE },
62 { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 },
63 { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
64 KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
65 { NULL, 0, 0 }
66 };
67
68 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
69 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
70 static uint32_t encode_surrogate(uint32_t code);
71 static uint32_t decode_surrogate(const u_char *ucs);
72
73 #ifdef MODULE_DEPEND
74 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
75 #endif
76
/*
 * UCS converter instance
 *
 * One of these is created by iconv_ucs_open() and released by
 * iconv_ucs_close().
 */
struct iconv_ucs {
	KOBJ_FIELDS;
	/* bitwise OR of KICONV_UCS_* flags describing this conversion */
	int convtype;
	/* the csp argument given to iconv_ucs_open() */
	struct iconv_cspair * d_csp;
	/*
	 * the cspf argument given to iconv_ucs_open(); only assigned when
	 * cspf is non-NULL and the source is UTF-8 or little-endian
	 */
	struct iconv_cspair * d_cspf;
	/* handle from iconv_open(ENCODING_UNICODE, from), or NULL */
	void * f_ctp;
	/* handle from iconv_open(to, ENCODING_UNICODE), or NULL */
	void * t_ctp;
	/*
	 * handle from iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8), or NULL;
	 * passed to towlower()/towupper() for case conversion
	 */
	void * ctype;
};
89
90 static int
91 iconv_ucs_open(struct iconv_converter_class *dcp,
92 struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
93 {
94 struct iconv_ucs *dp;
95 int i;
96 const char *from, *to;
97
98 dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
99 to = csp->cp_to;
100 from = cspf ? cspf->cp_from : csp->cp_from;
101
102 dp->convtype = 0;
103
104 if (cspf)
105 dp->convtype |= KICONV_UCS_COMBINE;
106 for (i = 0; unicode_family[i].name; i++) {
107 if (strcasecmp(from, unicode_family[i].name) == 0)
108 dp->convtype |= unicode_family[i].from_flag;
109 if (strcasecmp(to, unicode_family[i].name) == 0)
110 dp->convtype |= unicode_family[i].to_flag;
111 }
112 if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
113 dp->convtype |= KICONV_UCS_UCS4;
114 else
115 dp->convtype &= ~KICONV_UCS_UCS4;
116
117 dp->f_ctp = dp->t_ctp = NULL;
118 if (dp->convtype & KICONV_UCS_COMBINE) {
119 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
120 (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
121 iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
122 }
123 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
124 (dp->convtype & KICONV_UCS_TO_LE) == 0) {
125 iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
126 }
127 }
128
129 dp->ctype = NULL;
130 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
131 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
132
133 dp->d_csp = csp;
134 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
135 if (cspf) {
136 dp->d_cspf = cspf;
137 cspf->cp_refcount++;
138 } else
139 csp->cp_refcount++;
140 }
141 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
142 csp->cp_refcount++;
143 *dpp = (void*)dp;
144 return 0;
145 }
146
147 static int
148 iconv_ucs_close(void *data)
149 {
150 struct iconv_ucs *dp = data;
151
152 if (dp->f_ctp)
153 iconv_close(dp->f_ctp);
154 if (dp->t_ctp)
155 iconv_close(dp->t_ctp);
156 if (dp->ctype)
157 iconv_close(dp->ctype);
158 if (dp->d_cspf)
159 dp->d_cspf->cp_refcount--;
160 else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
161 dp->d_csp->cp_refcount--;
162 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
163 dp->d_csp->cp_refcount--;
164 kobj_delete((struct kobj*)data, M_ICONV);
165 return 0;
166 }
167
168 static int
169 iconv_ucs_conv(void *d2p, const char **inbuf,
170 size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
171 int convchar, int casetype)
172 {
173 struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
174 int ret = 0, i;
175 size_t in, on, ir, or, inlen, outlen, ucslen;
176 const char *src, *p;
177 char *dst;
178 u_char ucs[4], *q;
179 uint32_t code;
180
181 if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
182 return 0;
183 ir = in = *inbytesleft;
184 or = on = *outbytesleft;
185 src = *inbuf;
186 dst = *outbuf;
187
188 while (ir > 0 && or > 0) {
189
190 /*
191 * The first half of conversion.
192 * (convert any code into ENCODING_UNICODE)
193 */
194 code = 0;
195 p = src;
196 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
197 /* convert UTF-8 to ENCODING_UNICODE */
198 inlen = 0;
199 code = utf8_to_ucs4(p, &inlen, ir);
200 if (code == 0) {
201 ret = -1;
202 break;
203 }
204
205 if (casetype == KICONV_FROM_LOWER && dp->ctype) {
206 code = towlower(code, dp->ctype);
207 } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
208 code = towupper(code, dp->ctype);
209 }
210
211 if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
212 /* reserved for utf-16 surrogate pair */
213 /* invalid unicode */
214 ret = -1;
215 break;
216 }
217
218 if (inlen == 4) {
219 if (dp->convtype & KICONV_UCS_UCS4) {
220 ucslen = 4;
221 code = encode_surrogate(code);
222 } else {
223 /* can't handle with ucs-2 */
224 ret = -1;
225 break;
226 }
227 } else {
228 ucslen = 2;
229 }
230
231 /* save UCS-4 into ucs[] */
232 for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
233 *q++ = (code >> (i << 3)) & 0xff;
234
235 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
236 /* convert local code to ENCODING_UNICODE */
237 ucslen = 4;
238 inlen = ir;
239 q = ucs;
240 ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
241 &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
242 if (ret)
243 break;
244 inlen = ir - inlen;
245 ucslen = 4 - ucslen;
246
247 } else {
248 /* src code is a proper subset of ENCODING_UNICODE */
249 q = ucs;
250 if (dp->convtype & KICONV_UCS_FROM_LE) {
251 *q = *(p + 1);
252 *(q + 1) = *p;
253 p += 2;
254 } else {
255 *q = *p++;
256 *(q + 1) = *p++;
257 }
258 if ((*q & 0xfc) == 0xd8) {
259 if (dp->convtype & KICONV_UCS_UCS4 &&
260 dp->convtype & KICONV_UCS_FROM_UTF16) {
261 inlen = ucslen = 4;
262 } else {
263 /* invalid unicode */
264 ret = -1;
265 break;
266 }
267 } else {
268 inlen = ucslen = 2;
269 }
270 if (ir < inlen) {
271 ret = -1;
272 break;
273 }
274 if (ucslen == 4) {
275 q += 2;
276 if (dp->convtype & KICONV_UCS_FROM_LE) {
277 *q = *(p + 1);
278 *(q + 1) = *p;
279 } else {
280 *q = *p++;
281 *(q + 1) = *p;
282 }
283 if ((*q & 0xfc) != 0xdc) {
284 /* invalid unicode */
285 ret = -1;
286 break;
287 }
288 }
289 }
290
291 /*
292 * The second half of conversion.
293 * (convert ENCODING_UNICODE into any code)
294 */
295 p = ucs;
296 if (dp->convtype & KICONV_UCS_TO_UTF8) {
297 q = (u_char *)dst;
298 if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
299 /* decode surrogate pair */
300 code = decode_surrogate(p);
301 } else {
302 code = (ucs[0] << 8) | ucs[1];
303 }
304
305 if (casetype == KICONV_LOWER && dp->ctype) {
306 code = towlower(code, dp->ctype);
307 } else if (casetype == KICONV_UPPER && dp->ctype) {
308 code = towupper(code, dp->ctype);
309 }
310
311 outlen = 0;
312 if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
313 ret = -1;
314 break;
315 }
316
317 src += inlen;
318 ir -= inlen;
319 dst += outlen;
320 or -= outlen;
321
322 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
323 ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
324 &or, casetype & (KICONV_LOWER | KICONV_UPPER));
325 if (ret)
326 break;
327
328 src += inlen;
329 ir -= inlen;
330
331 } else {
332 /* dst code is a proper subset of ENCODING_UNICODE */
333 if (or < ucslen) {
334 ret = -1;
335 break;
336 }
337 src += inlen;
338 ir -= inlen;
339 or -= ucslen;
340 if (dp->convtype & KICONV_UCS_TO_LE) {
341 *dst++ = *(p + 1);
342 *dst++ = *p;
343 p += 2;
344 } else {
345 *dst++ = *p++;
346 *dst++ = *p++;
347 }
348 if (ucslen == 4) {
349 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
350 (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
351 ret = -1;
352 break;
353 }
354 if (dp->convtype & KICONV_UCS_TO_LE) {
355 *dst++ = *(p + 1);
356 *dst++ = *p;
357 } else {
358 *dst++ = *p++;
359 *dst++ = *p;
360 }
361 }
362 }
363
364 if (convchar == 1)
365 break;
366 }
367
368 *inbuf += in - ir;
369 *outbuf += on - or;
370 *inbytesleft -= in - ir;
371 *outbytesleft -= on - or;
372 return (ret);
373 }
374
375 static int
376 iconv_ucs_init(struct iconv_converter_class *dcp)
377 {
378 int error;
379
380 error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
381 if (error)
382 return (error);
383 error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
384 if (error)
385 return (error);
386 return (0);
387 }
388
/*
 * Class teardown hook.  Does nothing; always returns 0.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;

	return (0);
}
394
395 static const char *
396 iconv_ucs_name(struct iconv_converter_class *dcp)
397 {
398 return (ENCODING_UNICODE);
399 }
400
/*
 * Method table binding the iconv_converter interface methods to the
 * implementations in this file.  The list ends with an all-zero entry.
 */
static kobj_method_t iconv_ucs_methods[] = {
	KOBJMETHOD(iconv_converter_open, iconv_ucs_open),
	KOBJMETHOD(iconv_converter_close, iconv_ucs_close),
	KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv),
	KOBJMETHOD(iconv_converter_init, iconv_ucs_init),
	KOBJMETHOD(iconv_converter_done, iconv_ucs_done),
	KOBJMETHOD(iconv_converter_name, iconv_ucs_name),
	{0, 0}
};

/*
 * NOTE(review): presumably declares the "ucs" converter class, sized for
 * struct iconv_ucs and using the method table above -- confirm against the
 * KICONV_CONVERTER definition in <sys/iconv.h>.
 */
KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
412
/*
 * Decode one UTF-8 sequence of 1 to 4 bytes.
 *
 * src        first byte of the sequence
 * utf8width  set to the number of bytes consumed; written only on success
 * srclen     number of readable bytes at src
 *
 * Returns the decoded code point, or 0 when the sequence is empty,
 * truncated, has a bad lead or continuation byte, or is an overlong
 * (non-shortest) encoding.  Because 0 is also the failure value, an encoded
 * NUL cannot be told apart from an error.
 *
 * Surrogate values and values above U+10FFFF are not rejected here; the
 * caller checks those.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	size_t i, w = 0;
	uint32_t ucs4 = 0;
	uint32_t min;	/* smallest code point that needs w bytes */

	if (srclen == 0)
		return (0);

	/*
	 * get leading 1 byte from utf-8
	 */
	if ((*src & 0x80) == 0) {
		/*
		 * leading 1 bit is "0"
		 * utf-8: 0xxxxxxx
		 * ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		min = 0;
		/* get trailing 7 bits */
		ucs4 = *src & 0x7f;
	} else if ((*src & 0xe0) == 0xc0) {
		/*
		 * leading 3 bits are "110"
		 * utf-8: 110xxxxx 10yyyyyy
		 * ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		min = 0x80;
		/* get trailing 5 bits */
		ucs4 = *src & 0x1f;
	} else if ((*src & 0xf0) == 0xe0) {
		/*
		 * leading 4 bits are "1110"
		 * utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		min = 0x800;
		/* get trailing 4 bits */
		ucs4 = *src & 0x0f;
	} else if ((*src & 0xf8) == 0xf0) {
		/*
		 * leading 5 bits are "11110"
		 * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		min = 0x10000;
		/* get trailing 3 bits */
		ucs4 = *src & 0x07;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * get left parts from utf-8
	 */
	for (i = 1 ; i < w ; i++) {
		if ((*(src + i) & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		/* concatenate trailing 6 bits into ucs4 */
		ucs4 <<= 6;
		ucs4 |= *(src + i) & 0x3f;
	}

	/*
	 * Reject overlong forms: a value that fits a shorter sequence must
	 * not be accepted in a longer one (e.g. 0xc0 0xaf for '/').
	 */
	if (ucs4 < min)
		return (0);

	*utf8width = w;
	return (ucs4);
}
482
/*
 * Encode one code point as UTF-8.
 *
 * ucs4       code point to encode
 * dst        output buffer
 * utf8width  set to the number of bytes written; written only on success
 * dstlen     space available at dst
 *
 * Returns dst (as u_char *) on success, or NULL when the code point is
 * above U+10FFFF or the encoding does not fit in dstlen bytes.  Nothing is
 * written on failure.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and leading bits
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0;	/* "0" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "110" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "1110" */
	} else if (ucs4 < 0x110000) {
		/* U+10FFFF is the last code point; beyond it is not UTF-8 */
		w = 4;
		lead = 0xf0;	/* "11110" */
	} else {
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8
	 */
	p = (u_char *)dst;
	for (i = w - 1 ; i >= 1 ; i--) {
		/* get trailing 6 bits and put it with leading bits "10" */
		*(p + i) = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	/* what remains of ucs4 fits below the lead bits */
	*p = ucs4 | lead;

	*utf8width = w;

	return (p);
}
526
/*
 * Pack a supplementary code point into a UTF-16 surrogate pair, returned as
 * one 32-bit value: high surrogate in the upper 16 bits, low surrogate in
 * the lower 16 bits.  No range check is done on the argument.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	uint32_t offset, high_bits, low_bits;

	offset = code - 0x10000;
	/* upper ten bits of the offset, moved to bits 16..25 */
	high_bits = (offset << 6) & 0x3ff0000;
	/* lower ten bits of the offset stay in bits 0..9 */
	low_bits = offset & 0x3ff;

	return (high_bits | low_bits | 0xd800dc00);
}
533
/*
 * Rebuild a code point from four bytes holding a big-endian surrogate pair
 * (high surrogate in ucs[0..1], low surrogate in ucs[2..3]).  Only the low
 * ten bits of each half are used; the surrogate marker bits are not checked.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	uint32_t high_ten, low_ten;

	high_ten = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
	low_ten = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

	return (((high_ten << 10) | low_ten) + 0x10000);
}
540
Cache object: a0e9135d976a05c4f96d6c2c08dec00e
|