1 /*-
2 * Copyright (c) 2003, 2005 Ryuichiro Imura
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: releng/8.3/sys/libkern/iconv_ucs.c 230205 2012-01-16 08:29:52Z kevlo $");
29
30 #include <sys/param.h>
31 #include <sys/kernel.h>
32 #include <sys/systm.h>
33 #include <sys/malloc.h>
34 #include <sys/iconv.h>
35
36 #include "iconv_converter_if.h"
37
38 /*
39 * "UCS" converter
40 */
41
/*
 * Bits kept in iconv_ucs.convtype.  They are chosen once in
 * iconv_ucs_open() and steer every step of iconv_ucs_conv().
 */
#define KICONV_UCS_COMBINE 0x1 /* open was given a second cspair (cspf) */
#define KICONV_UCS_FROM_UTF8 0x2 /* source encoding is "UTF-8" */
#define KICONV_UCS_TO_UTF8 0x4 /* destination encoding is "UTF-8" */
#define KICONV_UCS_FROM_LE 0x8 /* source 16-bit units are byte-swapped on input */
#define KICONV_UCS_TO_LE 0x10 /* destination 16-bit units are byte-swapped on output */
#define KICONV_UCS_FROM_UTF16 0x20 /* source is UTF-16: surrogate pairs accepted */
#define KICONV_UCS_TO_UTF16 0x40 /* destination is UTF-16: surrogate pairs may be written */
#define KICONV_UCS_UCS4 0x80 /* set when ENCODING_UNICODE equals ENCODING_UTF16 */

/* Encoding names used for comparisons and for iconv_open() calls. */
#define ENCODING_UTF16 "UTF-16BE"
#define ENCODING_UTF8 "UTF-8"
53
54 static struct {
55 const char *name;
56 int from_flag, to_flag;
57 } unicode_family[] = {
58 { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 },
59 { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE },
60 { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 },
61 { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
62 KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
63 { NULL, 0, 0 }
64 };
65
/*
 * Forward declarations of the UTF-8 and surrogate-pair helpers that are
 * defined after the converter methods in this file.
 */
static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
static uint32_t encode_surrogate(uint32_t code);
static uint32_t decode_surrogate(const u_char *ucs);
70
/*
 * Record a dependency of iconv_ucs on libiconv, but only in builds where
 * the MODULE_DEPEND macro is defined.
 */
#ifdef MODULE_DEPEND
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif
74
75 /*
76 * UCS converter instance
77 */
struct iconv_ucs {
	KOBJ_FIELDS;
	/* OR of KICONV_UCS_* bits, computed in iconv_ucs_open() */
	int convtype;
	/* the csp argument given to iconv_ucs_open() */
	struct iconv_cspair * d_csp;
	/*
	 * the cspf argument given to iconv_ucs_open(); stored only when it
	 * was non-NULL and the source is UTF-8 or little-endian.
	 * iconv_ucs_close() drops its reference when this is non-NULL.
	 */
	struct iconv_cspair * d_cspf;
	/* handle from iconv_open(ENCODING_UNICODE, from), or NULL */
	void * f_ctp;
	/* handle from iconv_open(to, ENCODING_UNICODE), or NULL */
	void * t_ctp;
	/*
	 * handle from iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8), or NULL;
	 * passed to towlower()/towupper() for case conversion
	 */
	void * ctype;
};
87
88 static int
89 iconv_ucs_open(struct iconv_converter_class *dcp,
90 struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
91 {
92 struct iconv_ucs *dp;
93 int i;
94 const char *from, *to;
95
96 dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
97 to = csp->cp_to;
98 from = cspf ? cspf->cp_from : csp->cp_from;
99
100 dp->convtype = 0;
101
102 if (cspf)
103 dp->convtype |= KICONV_UCS_COMBINE;
104 for (i = 0; unicode_family[i].name; i++) {
105 if (strcmp(from, unicode_family[i].name) == 0)
106 dp->convtype |= unicode_family[i].from_flag;
107 if (strcmp(to, unicode_family[i].name) == 0)
108 dp->convtype |= unicode_family[i].to_flag;
109 }
110 if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
111 dp->convtype |= KICONV_UCS_UCS4;
112 else
113 dp->convtype &= ~KICONV_UCS_UCS4;
114
115 dp->f_ctp = dp->t_ctp = NULL;
116 if (dp->convtype & KICONV_UCS_COMBINE) {
117 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
118 (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
119 iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
120 }
121 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
122 (dp->convtype & KICONV_UCS_TO_LE) == 0) {
123 iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
124 }
125 }
126
127 dp->ctype = NULL;
128 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
129 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
130
131 dp->d_csp = csp;
132 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
133 if (cspf) {
134 dp->d_cspf = cspf;
135 cspf->cp_refcount++;
136 } else
137 csp->cp_refcount++;
138 }
139 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
140 csp->cp_refcount++;
141 *dpp = (void*)dp;
142 return 0;
143 }
144
145 static int
146 iconv_ucs_close(void *data)
147 {
148 struct iconv_ucs *dp = data;
149
150 if (dp->f_ctp)
151 iconv_close(dp->f_ctp);
152 if (dp->t_ctp)
153 iconv_close(dp->t_ctp);
154 if (dp->ctype)
155 iconv_close(dp->ctype);
156 if (dp->d_cspf)
157 dp->d_cspf->cp_refcount--;
158 else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
159 dp->d_csp->cp_refcount--;
160 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
161 dp->d_csp->cp_refcount--;
162 kobj_delete((struct kobj*)data, M_ICONV);
163 return 0;
164 }
165
166 static int
167 iconv_ucs_conv(void *d2p, const char **inbuf,
168 size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
169 int convchar, int casetype)
170 {
171 struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
172 int ret = 0, i;
173 size_t in, on, ir, or, inlen, outlen, ucslen;
174 const char *src, *p;
175 char *dst;
176 u_char ucs[4], *q;
177 uint32_t code;
178
179 if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
180 return 0;
181 ir = in = *inbytesleft;
182 or = on = *outbytesleft;
183 src = *inbuf;
184 dst = *outbuf;
185
186 while (ir > 0 && or > 0) {
187
188 /*
189 * The first half of conversion.
190 * (convert any code into ENCODING_UNICODE)
191 */
192 code = 0;
193 p = src;
194 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
195 /* convert UTF-8 to ENCODING_UNICODE */
196 inlen = 0;
197 code = utf8_to_ucs4(p, &inlen, ir);
198 if (code == 0) {
199 ret = -1;
200 break;
201 }
202
203 if (casetype == KICONV_FROM_LOWER && dp->ctype) {
204 code = towlower(code, dp->ctype);
205 } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
206 code = towupper(code, dp->ctype);
207 }
208
209 if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
210 /* reserved for utf-16 surrogate pair */
211 /* invalid unicode */
212 ret = -1;
213 break;
214 }
215
216 if (inlen == 4) {
217 if (dp->convtype & KICONV_UCS_UCS4) {
218 ucslen = 4;
219 code = encode_surrogate(code);
220 } else {
221 /* can't handle with ucs-2 */
222 ret = -1;
223 break;
224 }
225 } else {
226 ucslen = 2;
227 }
228
229 /* save UCS-4 into ucs[] */
230 for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
231 *q++ = (code >> (i << 3)) & 0xff;
232
233 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
234 /* convert local code to ENCODING_UNICODE */
235 ucslen = 4;
236 inlen = ir;
237 q = ucs;
238 ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
239 &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
240 if (ret)
241 break;
242 inlen = ir - inlen;
243 ucslen = 4 - ucslen;
244
245 } else {
246 /* src code is a proper subset of ENCODING_UNICODE */
247 q = ucs;
248 if (dp->convtype & KICONV_UCS_FROM_LE) {
249 *q = *(p + 1);
250 *(q + 1) = *p;
251 p += 2;
252 } else {
253 *q = *p++;
254 *(q + 1) = *p++;
255 }
256 if ((*q & 0xfc) == 0xd8) {
257 if (dp->convtype & KICONV_UCS_UCS4 &&
258 dp->convtype & KICONV_UCS_FROM_UTF16) {
259 inlen = ucslen = 4;
260 } else {
261 /* invalid unicode */
262 ret = -1;
263 break;
264 }
265 } else {
266 inlen = ucslen = 2;
267 }
268 if (ir < inlen) {
269 ret = -1;
270 break;
271 }
272 if (ucslen == 4) {
273 q += 2;
274 if (dp->convtype & KICONV_UCS_FROM_LE) {
275 *q = *(p + 1);
276 *(q + 1) = *p;
277 } else {
278 *q = *p++;
279 *(q + 1) = *p;
280 }
281 if ((*q & 0xfc) != 0xdc) {
282 /* invalid unicode */
283 ret = -1;
284 break;
285 }
286 }
287 }
288
289 /*
290 * The second half of conversion.
291 * (convert ENCODING_UNICODE into any code)
292 */
293 p = ucs;
294 if (dp->convtype & KICONV_UCS_TO_UTF8) {
295 q = (u_char *)dst;
296 if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
297 /* decode surrogate pair */
298 code = decode_surrogate(p);
299 } else {
300 code = (ucs[0] << 8) | ucs[1];
301 }
302
303 if (casetype == KICONV_LOWER && dp->ctype) {
304 code = towlower(code, dp->ctype);
305 } else if (casetype == KICONV_UPPER && dp->ctype) {
306 code = towupper(code, dp->ctype);
307 }
308
309 outlen = 0;
310 if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
311 ret = -1;
312 break;
313 }
314
315 src += inlen;
316 ir -= inlen;
317 dst += outlen;
318 or -= outlen;
319
320 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
321 ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
322 &or, casetype & (KICONV_LOWER | KICONV_UPPER));
323 if (ret)
324 break;
325
326 src += inlen;
327 ir -= inlen;
328
329 } else {
330 /* dst code is a proper subset of ENCODING_UNICODE */
331 if (or < ucslen) {
332 ret = -1;
333 break;
334 }
335 src += inlen;
336 ir -= inlen;
337 or -= ucslen;
338 if (dp->convtype & KICONV_UCS_TO_LE) {
339 *dst++ = *(p + 1);
340 *dst++ = *p;
341 p += 2;
342 } else {
343 *dst++ = *p++;
344 *dst++ = *p++;
345 }
346 if (ucslen == 4) {
347 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
348 (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
349 ret = -1;
350 break;
351 }
352 if (dp->convtype & KICONV_UCS_TO_LE) {
353 *dst++ = *(p + 1);
354 *dst++ = *p;
355 } else {
356 *dst++ = *p++;
357 *dst++ = *p;
358 }
359 }
360 }
361
362 if (convchar == 1)
363 break;
364 }
365
366 *inbuf += in - ir;
367 *outbuf += on - or;
368 *inbytesleft -= in - ir;
369 *outbytesleft -= on - or;
370 return (ret);
371 }
372
373 static int
374 iconv_ucs_init(struct iconv_converter_class *dcp)
375 {
376 int error;
377
378 error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
379 if (error)
380 return (error);
381 error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
382 if (error)
383 return (error);
384 return (0);
385 }
386
/*
 * Converter class teardown hook.  Performs no work and reports success;
 * dcp is not used.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	const int status = 0;

	return (status);
}
392
393 static const char *
394 iconv_ucs_name(struct iconv_converter_class *dcp)
395 {
396 return (ENCODING_UNICODE);
397 }
398
/*
 * Method table binding each iconv_converter interface method to the
 * implementation in this file.  The {0, 0} entry terminates the table.
 */
static kobj_method_t iconv_ucs_methods[] = {
	KOBJMETHOD(iconv_converter_open, iconv_ucs_open),
	KOBJMETHOD(iconv_converter_close, iconv_ucs_close),
	KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv),
	KOBJMETHOD(iconv_converter_init, iconv_ucs_init),
	KOBJMETHOD(iconv_converter_done, iconv_ucs_done),
	KOBJMETHOD(iconv_converter_name, iconv_ucs_name),
	{0, 0}
};
408
/*
 * Declare the "ucs" converter through the KICONV_CONVERTER macro (defined
 * outside this file), giving it the size of one converter instance.
 * NOTE(review): presumably the macro picks up iconv_ucs_methods by name;
 * confirm against its definition.
 */
KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
410
/*
 * Decode the single UTF-8 sequence that starts at src.
 *
 * src        first byte of the sequence
 * utf8width  set to the sequence length (1..4) on success; left
 *            untouched on failure
 * srclen     number of readable bytes at src
 *
 * Returns the decoded value, or 0 on failure.  Failure means: no input,
 * an invalid lead byte, a sequence longer than srclen, a trailing byte
 * that is not of the form 10xxxxxx, or an overlong encoding (a value
 * stored in more bytes than it needs).  A decoded value of 0 (the NUL
 * character) is therefore indistinguishable from failure.
 *
 * Surrogates and values above 0x10ffff are not rejected here; that check
 * is left to the caller.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* read as unsigned so the bit tests do not depend on char signedness */
	const unsigned char *s = (const unsigned char *)src;
	size_t i, w;
	uint32_t ucs4, min;

	if (srclen == 0)
		return (0);

	/*
	 * The lead byte gives the sequence width, the top payload bits and
	 * the smallest value that legitimately needs that width.
	 */
	if ((s[0] & 0x80) == 0) {
		/*
		 * utf-8: 0xxxxxxx
		 * ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		ucs4 = s[0] & 0x7f;
		min = 0;
	} else if ((s[0] & 0xe0) == 0xc0) {
		/*
		 * utf-8: 110xxxxx 10yyyyyy
		 * ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		ucs4 = s[0] & 0x1f;
		min = 0x80;
	} else if ((s[0] & 0xf0) == 0xe0) {
		/*
		 * utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		ucs4 = s[0] & 0x0f;
		min = 0x800;
	} else if ((s[0] & 0xf8) == 0xf0) {
		/*
		 * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		ucs4 = s[0] & 0x07;
		min = 0x10000;
	} else {
		/* stray continuation byte, or a lead byte for 5+ bytes */
		return (0);
	}

	if (srclen < w)
		return (0);

	/* fold in the 6 payload bits of each continuation byte */
	for (i = 1 ; i < w ; i++) {
		if ((s[i] & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		ucs4 = (ucs4 << 6) | (s[i] & 0x3f);
	}

	/* overlong: the value would have fit in a shorter sequence */
	if (ucs4 < min)
		return (0);

	*utf8width = w;
	return (ucs4);
}
482
/*
 * Encode one code point as UTF-8.
 *
 * ucs4       value to encode; must be below 0x110000
 * dst        output buffer
 * utf8width  set to the number of bytes written (1..4) on success;
 *            left untouched on failure
 * dstlen     room available at dst
 *
 * Returns dst (as u_char *) on success, or NULL when ucs4 is out of
 * range or dstlen is too small.  Nothing is written on failure.
 * Values in the surrogate range are encoded as ordinary 3-byte
 * sequences; rejecting them is left to the caller.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and leading bits
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0;	/* 0xxxxxxx */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* 110xxxxx */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* 1110xxxx */
	} else if (ucs4 < 0x110000) {
		w = 4;
		lead = 0xf0;	/* 11110xxx */
	} else {
		/* beyond U+10FFFF: not encodable as valid UTF-8 */
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8, last byte first
	 */
	p = (u_char *)dst;
	for (i = w - 1 ; i >= 1 ; i--) {
		/* low 6 bits, tagged as a continuation byte (10xxxxxx) */
		p[i] = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	/* whatever bits remain fit beside the lead marker */
	p[0] = ucs4 | lead;

	*utf8width = w;

	return (p);
}
526
/*
 * Pack a supplementary code point into a UTF-16 surrogate pair held in
 * one 32-bit word: high surrogate in the upper 16 bits, low surrogate
 * in the lower 16 bits.  The caller passes values of 0x10000 or more.
 */
static uint32_t
encode_surrogate(register uint32_t code)
{
	uint32_t offset, high_bits, low_bits;

	offset = code - 0x10000;
	/* upper 10 bits of the offset, moved into bits 16..25 */
	high_bits = (offset << 6) & 0x3ff0000;
	/* lower 10 bits of the offset stay in bits 0..9 */
	low_bits = offset & 0x3ff;

	return (high_bits | low_bits | 0xd800dc00);
}
533
/*
 * Rebuild a code point from a surrogate pair stored as four big-endian
 * bytes: ucs[0..1] hold the high surrogate, ucs[2..3] the low one.
 * Only the low 10 bits of each 16-bit unit are used; the surrogate
 * marker bits are not checked here.
 */
static uint32_t
decode_surrogate(register const u_char *ucs)
{
	uint32_t high10, low10;

	high10 = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
	low10 = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

	return (((high10 << 10) | low10) + 0x10000);
}
540
Cache object: 8413b89f8909baf940c5c098a0e87630
|