iconv_ucs.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2003, 2005 Ryuichiro Imura
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  */
   28 
   29 #include <sys/cdefs.h>
   30 __FBSDID("$FreeBSD$");
   31 
   32 #include <sys/param.h>
   33 #include <sys/kernel.h>
   34 #include <sys/systm.h>
   35 #include <sys/malloc.h>
   36 #include <sys/iconv.h>
   37 
   38 #include "iconv_converter_if.h"
   39 
   40 /*
   41  * "UCS" converter
   42  */
   43 
   44 #define KICONV_UCS_COMBINE      0x1
   45 #define KICONV_UCS_FROM_UTF8    0x2
   46 #define KICONV_UCS_TO_UTF8      0x4
   47 #define KICONV_UCS_FROM_LE      0x8
   48 #define KICONV_UCS_TO_LE        0x10
   49 #define KICONV_UCS_FROM_UTF16   0x20
   50 #define KICONV_UCS_TO_UTF16     0x40
   51 #define KICONV_UCS_UCS4         0x80
   52 
   53 #define ENCODING_UTF16  "UTF-16BE"
   54 #define ENCODING_UTF8   "UTF-8"
   55 
   56 static struct {
   57         const char *name;
   58         int from_flag, to_flag;
   59 } unicode_family[] = {
   60         { "UTF-8",      KICONV_UCS_FROM_UTF8,   KICONV_UCS_TO_UTF8 },
   61         { "UCS-2LE",    KICONV_UCS_FROM_LE,     KICONV_UCS_TO_LE },
   62         { "UTF-16BE",   KICONV_UCS_FROM_UTF16,  KICONV_UCS_TO_UTF16 },
   63         { "UTF-16LE",   KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
   64             KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
   65         { NULL,         0,      0 }
   66 };
   67 
   68 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
   69 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
   70 static uint32_t encode_surrogate(uint32_t code);
   71 static uint32_t decode_surrogate(const u_char *ucs);
   72 
/*
 * Declare a dependency of iconv_ucs on libiconv, but only when the
 * MODULE_DEPEND macro is available in this build.
 * NOTE(review): the three 2s are presumably the minimum, preferred and
 * maximum libiconv versions -- confirm against the MODULE_DEPEND definition.
 */
#ifdef MODULE_DEPEND
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif
   76 
   77 /*
   78  * UCS converter instance
   79  */
   80 struct iconv_ucs {
   81         KOBJ_FIELDS;
   82         int                     convtype;
   83         struct iconv_cspair *   d_csp;
   84         struct iconv_cspair *   d_cspf;
   85         void *                  f_ctp;
   86         void *                  t_ctp;
   87         void *                  ctype;
   88 };
   89 
   90 static int
   91 iconv_ucs_open(struct iconv_converter_class *dcp,
   92         struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
   93 {
   94         struct iconv_ucs *dp;
   95         int i;
   96         const char *from, *to;
   97 
   98         dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
   99         to = csp->cp_to;
  100         from = cspf ? cspf->cp_from : csp->cp_from;
  101 
  102         dp->convtype = 0;
  103 
  104         if (cspf)
  105                 dp->convtype |= KICONV_UCS_COMBINE;
  106         for (i = 0; unicode_family[i].name; i++) {
  107                 if (strcasecmp(from, unicode_family[i].name) == 0)
  108                         dp->convtype |= unicode_family[i].from_flag;
  109                 if (strcasecmp(to, unicode_family[i].name) == 0)
  110                         dp->convtype |= unicode_family[i].to_flag;
  111         }
  112         if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
  113                 dp->convtype |= KICONV_UCS_UCS4;
  114         else
  115                 dp->convtype &= ~KICONV_UCS_UCS4;
  116 
  117         dp->f_ctp = dp->t_ctp = NULL;
  118         if (dp->convtype & KICONV_UCS_COMBINE) {
  119                 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
  120                     (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
  121                         iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
  122                 }
  123                 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
  124                     (dp->convtype & KICONV_UCS_TO_LE) == 0) {
  125                         iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
  126                 }
  127         }
  128 
  129         dp->ctype = NULL;
  130         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
  131                 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
  132 
  133         dp->d_csp = csp;
  134         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
  135                 if (cspf) {
  136                         dp->d_cspf = cspf;
  137                         cspf->cp_refcount++;
  138                 } else
  139                         csp->cp_refcount++;
  140         }
  141         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
  142                 csp->cp_refcount++;
  143         *dpp = (void*)dp;
  144         return 0;
  145 }
  146 
  147 static int
  148 iconv_ucs_close(void *data)
  149 {
  150         struct iconv_ucs *dp = data;
  151 
  152         if (dp->f_ctp)
  153                 iconv_close(dp->f_ctp);
  154         if (dp->t_ctp)
  155                 iconv_close(dp->t_ctp);
  156         if (dp->ctype)
  157                 iconv_close(dp->ctype);
  158         if (dp->d_cspf)
  159                 dp->d_cspf->cp_refcount--;
  160         else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
  161                 dp->d_csp->cp_refcount--;
  162         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
  163                 dp->d_csp->cp_refcount--;
  164         kobj_delete((struct kobj*)data, M_ICONV);
  165         return 0;
  166 }
  167 
  168 static int
  169 iconv_ucs_conv(void *d2p, const char **inbuf,
  170         size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
  171         int convchar, int casetype)
  172 {
  173         struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
  174         int ret = 0, i;
  175         size_t in, on, ir, or, inlen, outlen, ucslen;
  176         const char *src, *p;
  177         char *dst;
  178         u_char ucs[4], *q;
  179         uint32_t code;
  180 
  181         if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
  182                 return 0;
  183         ir = in = *inbytesleft;
  184         or = on = *outbytesleft;
  185         src = *inbuf;
  186         dst = *outbuf;
  187 
  188         while (ir > 0 && or > 0) {
  189                 /*
  190                  * The first half of conversion.
  191                  * (convert any code into ENCODING_UNICODE)
  192                  */
  193                 code = 0;
  194                 p = src;
  195                 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
  196                         /* convert UTF-8 to ENCODING_UNICODE */
  197                         inlen = 0;
  198                         code = utf8_to_ucs4(p, &inlen, ir);
  199                         if (code == 0) {
  200                                 ret = -1;
  201                                 break;
  202                         }
  203 
  204                         if (casetype == KICONV_FROM_LOWER && dp->ctype) {
  205                                 code = towlower(code, dp->ctype);
  206                         } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
  207                                 code = towupper(code, dp->ctype);
  208                         }
  209 
  210                         if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
  211                                 /* reserved for utf-16 surrogate pair */
  212                                 /* invalid unicode */
  213                                 ret = -1;
  214                                 break;
  215                         }
  216 
  217                         if (inlen == 4) {
  218                                 if (dp->convtype & KICONV_UCS_UCS4) {
  219                                         ucslen = 4;
  220                                         code = encode_surrogate(code);
  221                                 } else {
  222                                         /* can't handle with ucs-2 */
  223                                         ret = -1;
  224                                         break;
  225                                 }
  226                         } else {
  227                                 ucslen = 2;
  228                         }
  229 
  230                         /* save UCS-4 into ucs[] */
  231                         for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
  232                                 *q++ = (code >> (i << 3)) & 0xff;
  233 
  234                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
  235                         /* convert local code to ENCODING_UNICODE */
  236                         ucslen = 4;
  237                         inlen = ir;
  238                         q = ucs;
  239                         ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
  240                             &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
  241                         if (ret)
  242                                 break;
  243                         inlen = ir - inlen;
  244                         ucslen = 4 - ucslen;
  245 
  246                 } else {
  247                         /* src code is a proper subset of ENCODING_UNICODE */
  248                         q = ucs;
  249                         if (dp->convtype & KICONV_UCS_FROM_LE) {
  250                                 *q = *(p + 1);
  251                                 *(q + 1) = *p;
  252                                 p += 2;
  253                         } else {
  254                                 *q = *p++;
  255                                 *(q + 1) = *p++;
  256                         }
  257                         if ((*q & 0xfc) == 0xd8) {
  258                                 if (dp->convtype & KICONV_UCS_UCS4 &&
  259                                     dp->convtype & KICONV_UCS_FROM_UTF16) {
  260                                         inlen = ucslen = 4;
  261                                 } else {
  262                                         /* invalid unicode */
  263                                         ret = -1;
  264                                         break;
  265                                 }
  266                         } else {
  267                                 inlen = ucslen = 2;
  268                         }
  269                         if (ir < inlen) {
  270                                 ret = -1;
  271                                 break;
  272                         }
  273                         if (ucslen == 4) {
  274                                 q += 2;
  275                                 if (dp->convtype & KICONV_UCS_FROM_LE) {
  276                                         *q = *(p + 1);
  277                                         *(q + 1) = *p;
  278                                 } else {
  279                                         *q = *p++;
  280                                         *(q + 1) = *p;
  281                                 }
  282                                 if ((*q & 0xfc) != 0xdc) {
  283                                         /* invalid unicode */
  284                                         ret = -1;
  285                                         break;
  286                                 }
  287                         }
  288                 }
  289 
  290                 /*
  291                  * The second half of conversion.
  292                  * (convert ENCODING_UNICODE into any code)
  293                  */
  294                 p = ucs;
  295                 if (dp->convtype & KICONV_UCS_TO_UTF8) {
  296                         q = (u_char *)dst;
  297                         if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
  298                                 /* decode surrogate pair */
  299                                 code = decode_surrogate(p);
  300                         } else {
  301                                 code = (ucs[0] << 8) | ucs[1];
  302                         }
  303 
  304                         if (casetype == KICONV_LOWER && dp->ctype) {
  305                                 code = towlower(code, dp->ctype);
  306                         } else if (casetype == KICONV_UPPER && dp->ctype) {
  307                                 code = towupper(code, dp->ctype);
  308                         }
  309 
  310                         outlen = 0;
  311                         if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
  312                                 ret = -1;
  313                                 break;
  314                         }
  315 
  316                         src += inlen;
  317                         ir -= inlen;
  318                         dst += outlen;
  319                         or -= outlen;
  320 
  321                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
  322                         ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
  323                             &or, casetype & (KICONV_LOWER | KICONV_UPPER));
  324                         if (ret)
  325                                 break;
  326 
  327                         src += inlen;
  328                         ir -= inlen;
  329 
  330                 } else {
  331                         /* dst code is a proper subset of ENCODING_UNICODE */
  332                         if (or < ucslen) {
  333                                 ret = -1;
  334                                 break;
  335                         }
  336                         src += inlen;
  337                         ir -= inlen;
  338                         or -= ucslen;
  339                         if (dp->convtype & KICONV_UCS_TO_LE) {
  340                                 *dst++ = *(p + 1);
  341                                 *dst++ = *p;
  342                                 p += 2;
  343                         } else {
  344                                 *dst++ = *p++;
  345                                 *dst++ = *p++;
  346                         }
  347                         if (ucslen == 4) {
  348                                 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
  349                                     (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
  350                                         ret = -1;
  351                                         break;
  352                                 }
  353                                 if (dp->convtype & KICONV_UCS_TO_LE) {
  354                                         *dst++ = *(p + 1);
  355                                         *dst++ = *p;
  356                                 } else {
  357                                         *dst++ = *p++;
  358                                         *dst++ = *p;
  359                                 }
  360                         }
  361                 }
  362 
  363                 if (convchar == 1)
  364                         break;
  365         }
  366 
  367         *inbuf += in - ir;
  368         *outbuf += on - or;
  369         *inbytesleft -= in - ir;
  370         *outbytesleft -= on - or;
  371         return (ret);
  372 }
  373 
  374 static int
  375 iconv_ucs_init(struct iconv_converter_class *dcp)
  376 {
  377         int error;
  378 
  379         error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
  380         if (error)
  381                 return (error);
  382         error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
  383         if (error)
  384                 return (error);
  385         return (0);
  386 }
  387 
  388 static int
  389 iconv_ucs_done(struct iconv_converter_class *dcp)
  390 {
  391         return (0);
  392 }
  393 
  394 static const char *
  395 iconv_ucs_name(struct iconv_converter_class *dcp)
  396 {
  397         return (ENCODING_UNICODE);
  398 }
  399 
/*
 * Method table binding each iconv_converter_* interface method to the
 * implementation in this file.  The {0, 0} entry ends the table.
 */
static kobj_method_t iconv_ucs_methods[] = {
        KOBJMETHOD(iconv_converter_open,        iconv_ucs_open),
        KOBJMETHOD(iconv_converter_close,       iconv_ucs_close),
        KOBJMETHOD(iconv_converter_conv,        iconv_ucs_conv),
        KOBJMETHOD(iconv_converter_init,        iconv_ucs_init),
        KOBJMETHOD(iconv_converter_done,        iconv_ucs_done),
        KOBJMETHOD(iconv_converter_name,        iconv_ucs_name),
        {0, 0}
};

/*
 * NOTE(review): presumably defines the "ucs" converter class using the table
 * above, with sizeof(struct iconv_ucs) as the per-instance allocation size --
 * confirm against the KICONV_CONVERTER definition.
 */
KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
  411 
  412 static uint32_t
  413 utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
  414 {
  415         size_t i, w = 0;
  416         uint32_t ucs4 = 0;
  417 
  418         /*
  419          * get leading 1 byte from utf-8
  420          */
  421         if ((*src & 0x80) == 0) {
  422                 /*
  423                  * leading 1 bit is ""
  424                  *  utf-8: 0xxxxxxx
  425                  *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
  426                  */
  427                 w = 1;
  428                 /* get trailing 7 bits */
  429                 ucs4 = *src & 0x7f;
  430         } else if ((*src & 0xe0) == 0xc0) {
  431                 /*
  432                  * leading 3 bits are "110"
  433                  *  utf-8: 110xxxxx 10yyyyyy
  434                  *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
  435                  */
  436                 w = 2;
  437                 /* get trailing 5 bits */
  438                 ucs4 = *src & 0x1f;
  439         } else if ((*src & 0xf0) == 0xe0) {
  440                 /*
  441                  * leading 4 bits are "1110"
  442                  *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
  443                  *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
  444                  */
  445                 w = 3;
  446                 /* get trailing 4 bits */
  447                 ucs4 = *src & 0x0f;
  448         } else if ((*src & 0xf8) == 0xf0) {
  449                 /*
  450                  * leading 5 bits are "11110"
  451                  *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
  452                  *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
  453                  */
  454                 w = 4;
  455                 /* get trailing 3 bits */
  456                 ucs4 = *src & 0x07;
  457         } else {
  458                 /* out of utf-16 range or having illegal bits */
  459                 return (0);
  460         }
  461 
  462         if (srclen < w)
  463                 return (0);
  464 
  465         /*
  466          * get left parts from utf-8
  467          */
  468         for (i = 1 ; i < w ; i++) {
  469                 if ((*(src + i) & 0xc0) != 0x80) {
  470                         /* invalid: leading 2 bits are not "10" */
  471                         return (0);
  472                 }
  473                 /* concatenate trailing 6 bits into ucs4 */
  474                 ucs4 <<= 6;
  475                 ucs4 |= *(src + i) & 0x3f;
  476         }
  477 
  478         *utf8width = w;
  479         return (ucs4);
  480 }
  481 
  482 static u_char *
  483 ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
  484 {
  485         u_char lead, *p;
  486         size_t i, w;
  487 
  488         /*
  489          * determine utf-8 width and leading bits
  490          */
  491         if (ucs4 < 0x80) {
  492                 w = 1;
  493                 lead = 0;       /* "" */
  494         } else if (ucs4 < 0x800) {
  495                 w = 2;
  496                 lead = 0xc0;    /* "11" */
  497         } else if (ucs4 < 0x10000) {
  498                 w = 3;
  499                 lead = 0xe0;    /* "111" */
  500         } else if (ucs4 < 0x200000) {
  501                 w = 4;
  502                 lead = 0xf0;    /* "1111" */
  503         } else {
  504                 return (NULL);
  505         }
  506 
  507         if (dstlen < w)
  508                 return (NULL);
  509 
  510         /*
  511          * construct utf-8
  512          */
  513         p = dst;
  514         for (i = w - 1 ; i >= 1 ; i--) {
  515                 /* get trailing 6 bits and put it with leading bit as "1" */
  516                 *(p + i) = (ucs4 & 0x3f) | 0x80;
  517                 ucs4 >>= 6;
  518         }
  519         *p = ucs4 | lead;
  520 
  521         *utf8width = w;
  522 
  523         return (p);
  524 }
  525 
  526 static uint32_t
  527 encode_surrogate(uint32_t code)
  528 {
  529         return ((((code - 0x10000) << 6) & 0x3ff0000) |
  530             ((code - 0x10000) & 0x3ff) | 0xd800dc00);
  531 }
  532 
  533 static uint32_t
  534 decode_surrogate(const u_char *ucs)
  535 {
  536         return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) |
  537             ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000);
  538 }
Cache object: 305ae25028a85973bfbe6ee3ff28a986
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/libkern/iconv_ucs.c

FreeBSD/Linux Kernel Cross Reference
sys/libkern/iconv_ucs.c