iconv_ucs.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*-
    2  * Copyright (c) 2003, 2005 Ryuichiro Imura
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD: releng/8.3/sys/libkern/iconv_ucs.c 230205 2012-01-16 08:29:52Z kevlo $");
   29 
   30 #include <sys/param.h>
   31 #include <sys/kernel.h>
   32 #include <sys/systm.h>
   33 #include <sys/malloc.h>
   34 #include <sys/iconv.h>
   35 
   36 #include "iconv_converter_if.h"
   37 
   38 /*
   39  * "UCS" converter
   40  */
   41 
   42 #define KICONV_UCS_COMBINE      0x1
   43 #define KICONV_UCS_FROM_UTF8    0x2
   44 #define KICONV_UCS_TO_UTF8      0x4
   45 #define KICONV_UCS_FROM_LE      0x8
   46 #define KICONV_UCS_TO_LE        0x10
   47 #define KICONV_UCS_FROM_UTF16   0x20
   48 #define KICONV_UCS_TO_UTF16     0x40
   49 #define KICONV_UCS_UCS4         0x80
   50 
   51 #define ENCODING_UTF16  "UTF-16BE"
   52 #define ENCODING_UTF8   "UTF-8"
   53 
   54 static struct {
   55         const char *name;
   56         int from_flag, to_flag;
   57 } unicode_family[] = {
   58         { "UTF-8",      KICONV_UCS_FROM_UTF8,   KICONV_UCS_TO_UTF8 },
   59         { "UCS-2LE",    KICONV_UCS_FROM_LE,     KICONV_UCS_TO_LE },
   60         { "UTF-16BE",   KICONV_UCS_FROM_UTF16,  KICONV_UCS_TO_UTF16 },
   61         { "UTF-16LE",   KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
   62             KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
   63         { NULL,         0,      0 }
   64 };
   65 
   66 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
   67 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
   68 static uint32_t encode_surrogate(uint32_t code);
   69 static uint32_t decode_surrogate(const u_char *ucs);
   70 
   71 #ifdef MODULE_DEPEND
   72 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
   73 #endif
   74 
   75 /*
   76  * UCS converter instance
   77  */
   78 struct iconv_ucs {
   79         KOBJ_FIELDS;
   80         int                     convtype;
   81         struct iconv_cspair *   d_csp;
   82         struct iconv_cspair *   d_cspf;
   83         void *                  f_ctp;
   84         void *                  t_ctp;
   85         void *                  ctype;
   86 };
   87 
   88 static int
   89 iconv_ucs_open(struct iconv_converter_class *dcp,
   90         struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
   91 {
   92         struct iconv_ucs *dp;
   93         int i;
   94         const char *from, *to;
   95 
   96         dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
   97         to = csp->cp_to;
   98         from = cspf ? cspf->cp_from : csp->cp_from;
   99 
  100         dp->convtype = 0;
  101 
  102         if (cspf)
  103                 dp->convtype |= KICONV_UCS_COMBINE;
  104         for (i = 0; unicode_family[i].name; i++) {
  105                 if (strcmp(from, unicode_family[i].name) == 0)
  106                         dp->convtype |= unicode_family[i].from_flag;
  107                 if (strcmp(to, unicode_family[i].name) == 0)
  108                         dp->convtype |= unicode_family[i].to_flag;
  109         }
  110         if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
  111                 dp->convtype |= KICONV_UCS_UCS4;
  112         else
  113                 dp->convtype &= ~KICONV_UCS_UCS4;
  114 
  115         dp->f_ctp = dp->t_ctp = NULL;
  116         if (dp->convtype & KICONV_UCS_COMBINE) {
  117                 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
  118                     (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
  119                         iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
  120                 }
  121                 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
  122                     (dp->convtype & KICONV_UCS_TO_LE) == 0) {
  123                         iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
  124                 }
  125         }
  126 
  127         dp->ctype = NULL;
  128         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
  129                 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
  130 
  131         dp->d_csp = csp;
  132         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
  133                 if (cspf) {
  134                         dp->d_cspf = cspf;
  135                         cspf->cp_refcount++;
  136                 } else
  137                         csp->cp_refcount++;
  138         }
  139         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
  140                 csp->cp_refcount++;
  141         *dpp = (void*)dp;
  142         return 0;
  143 }
  144 
  145 static int
  146 iconv_ucs_close(void *data)
  147 {
  148         struct iconv_ucs *dp = data;
  149 
  150         if (dp->f_ctp)
  151                 iconv_close(dp->f_ctp);
  152         if (dp->t_ctp)
  153                 iconv_close(dp->t_ctp);
  154         if (dp->ctype)
  155                 iconv_close(dp->ctype);
  156         if (dp->d_cspf)
  157                 dp->d_cspf->cp_refcount--;
  158         else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
  159                 dp->d_csp->cp_refcount--;
  160         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
  161                 dp->d_csp->cp_refcount--;
  162         kobj_delete((struct kobj*)data, M_ICONV);
  163         return 0;
  164 }
  165 
  166 static int
  167 iconv_ucs_conv(void *d2p, const char **inbuf,
  168         size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
  169         int convchar, int casetype)
  170 {
  171         struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
  172         int ret = 0, i;
  173         size_t in, on, ir, or, inlen, outlen, ucslen;
  174         const char *src, *p;
  175         char *dst;
  176         u_char ucs[4], *q;
  177         uint32_t code;
  178 
  179         if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
  180                 return 0;
  181         ir = in = *inbytesleft;
  182         or = on = *outbytesleft;
  183         src = *inbuf;
  184         dst = *outbuf;
  185 
  186         while (ir > 0 && or > 0) {
  187 
  188                 /*
  189                  * The first half of conversion.
  190                  * (convert any code into ENCODING_UNICODE)
  191                  */
  192                 code = 0;
  193                 p = src;
  194                 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
  195                         /* convert UTF-8 to ENCODING_UNICODE */
  196                         inlen = 0;
  197                         code = utf8_to_ucs4(p, &inlen, ir);
  198                         if (code == 0) {
  199                                 ret = -1;
  200                                 break;
  201                         }
  202 
  203                         if (casetype == KICONV_FROM_LOWER && dp->ctype) {
  204                                 code = towlower(code, dp->ctype);
  205                         } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
  206                                 code = towupper(code, dp->ctype);
  207                         }
  208 
  209                         if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
  210                                 /* reserved for utf-16 surrogate pair */
  211                                 /* invalid unicode */
  212                                 ret = -1;
  213                                 break;
  214                         }
  215 
  216                         if (inlen == 4) {
  217                                 if (dp->convtype & KICONV_UCS_UCS4) {
  218                                         ucslen = 4;
  219                                         code = encode_surrogate(code);
  220                                 } else {
  221                                         /* can't handle with ucs-2 */
  222                                         ret = -1;
  223                                         break;
  224                                 }
  225                         } else {
  226                                 ucslen = 2;
  227                         }
  228 
  229                         /* save UCS-4 into ucs[] */
  230                         for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
  231                                 *q++ = (code >> (i << 3)) & 0xff;
  232 
  233                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
  234                         /* convert local code to ENCODING_UNICODE */
  235                         ucslen = 4;
  236                         inlen = ir;
  237                         q = ucs;
  238                         ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
  239                             &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
  240                         if (ret)
  241                                 break;
  242                         inlen = ir - inlen;
  243                         ucslen = 4 - ucslen;
  244 
  245                 } else {
  246                         /* src code is a proper subset of ENCODING_UNICODE */
  247                         q = ucs;
  248                         if (dp->convtype & KICONV_UCS_FROM_LE) {
  249                                 *q = *(p + 1);
  250                                 *(q + 1) = *p;
  251                                 p += 2;
  252                         } else {
  253                                 *q = *p++;
  254                                 *(q + 1) = *p++;
  255                         }
  256                         if ((*q & 0xfc) == 0xd8) {
  257                                 if (dp->convtype & KICONV_UCS_UCS4 &&
  258                                     dp->convtype & KICONV_UCS_FROM_UTF16) {
  259                                         inlen = ucslen = 4;
  260                                 } else {
  261                                         /* invalid unicode */
  262                                         ret = -1;
  263                                         break;
  264                                 }
  265                         } else {
  266                                 inlen = ucslen = 2;
  267                         }
  268                         if (ir < inlen) {
  269                                 ret = -1;
  270                                 break;
  271                         }
  272                         if (ucslen == 4) {
  273                                 q += 2;
  274                                 if (dp->convtype & KICONV_UCS_FROM_LE) {
  275                                         *q = *(p + 1);
  276                                         *(q + 1) = *p;
  277                                 } else {
  278                                         *q = *p++;
  279                                         *(q + 1) = *p;
  280                                 }
  281                                 if ((*q & 0xfc) != 0xdc) {
  282                                         /* invalid unicode */
  283                                         ret = -1;
  284                                         break;
  285                                 }
  286                         }
  287                 }
  288 
  289                 /*
  290                  * The second half of conversion.
  291                  * (convert ENCODING_UNICODE into any code)
  292                  */
  293                 p = ucs;
  294                 if (dp->convtype & KICONV_UCS_TO_UTF8) {
  295                         q = (u_char *)dst;
  296                         if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
  297                                 /* decode surrogate pair */
  298                                 code = decode_surrogate(p);
  299                         } else {
  300                                 code = (ucs[0] << 8) | ucs[1];
  301                         }
  302 
  303                         if (casetype == KICONV_LOWER && dp->ctype) {
  304                                 code = towlower(code, dp->ctype);
  305                         } else if (casetype == KICONV_UPPER && dp->ctype) {
  306                                 code = towupper(code, dp->ctype);
  307                         }
  308 
  309                         outlen = 0;
  310                         if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
  311                                 ret = -1;
  312                                 break;
  313                         }
  314 
  315                         src += inlen;
  316                         ir -= inlen;
  317                         dst += outlen;
  318                         or -= outlen;
  319 
  320                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
  321                         ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
  322                             &or, casetype & (KICONV_LOWER | KICONV_UPPER));
  323                         if (ret)
  324                                 break;
  325 
  326                         src += inlen;
  327                         ir -= inlen;
  328 
  329                 } else {
  330                         /* dst code is a proper subset of ENCODING_UNICODE */
  331                         if (or < ucslen) {
  332                                 ret = -1;
  333                                 break;
  334                         }
  335                         src += inlen;
  336                         ir -= inlen;
  337                         or -= ucslen;
  338                         if (dp->convtype & KICONV_UCS_TO_LE) {
  339                                 *dst++ = *(p + 1);
  340                                 *dst++ = *p;
  341                                 p += 2;
  342                         } else {
  343                                 *dst++ = *p++;
  344                                 *dst++ = *p++;
  345                         }
  346                         if (ucslen == 4) {
  347                                 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
  348                                     (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
  349                                         ret = -1;
  350                                         break;
  351                                 }
  352                                 if (dp->convtype & KICONV_UCS_TO_LE) {
  353                                         *dst++ = *(p + 1);
  354                                         *dst++ = *p;
  355                                 } else {
  356                                         *dst++ = *p++;
  357                                         *dst++ = *p;
  358                                 }
  359                         }
  360                 }
  361 
  362                 if (convchar == 1)
  363                         break;
  364         }
  365 
  366         *inbuf += in - ir;
  367         *outbuf += on - or;
  368         *inbytesleft -= in - ir;
  369         *outbytesleft -= on - or;
  370         return (ret);
  371 }
  372 
  373 static int
  374 iconv_ucs_init(struct iconv_converter_class *dcp)
  375 {
  376         int error;
  377 
  378         error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
  379         if (error)
  380                 return (error);
  381         error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
  382         if (error)
  383                 return (error);
  384         return (0);
  385 }
  386 
  387 static int
  388 iconv_ucs_done(struct iconv_converter_class *dcp)
  389 {
  390         return (0);
  391 }
  392 
  393 static const char *
  394 iconv_ucs_name(struct iconv_converter_class *dcp)
  395 {
  396         return (ENCODING_UNICODE);
  397 }
  398 
  399 static kobj_method_t iconv_ucs_methods[] = {
  400         KOBJMETHOD(iconv_converter_open,        iconv_ucs_open),
  401         KOBJMETHOD(iconv_converter_close,       iconv_ucs_close),
  402         KOBJMETHOD(iconv_converter_conv,        iconv_ucs_conv),
  403         KOBJMETHOD(iconv_converter_init,        iconv_ucs_init),
  404         KOBJMETHOD(iconv_converter_done,        iconv_ucs_done),
  405         KOBJMETHOD(iconv_converter_name,        iconv_ucs_name),
  406         {0, 0}
  407 };
  408 
  409 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
  410 
  411 static uint32_t
  412 utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
  413 {
  414         size_t i, w = 0;
  415         uint32_t ucs4 = 0;
  416 
  417         /*
  418          * get leading 1 byte from utf-8
  419          */
  420         if ((*src & 0x80) == 0) {
  421                 /*
  422                  * leading 1 bit is ""
  423                  *  utf-8: 0xxxxxxx
  424                  *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
  425                  */
  426                 w = 1;
  427                 /* get trailing 7 bits */
  428                 ucs4 = *src & 0x7f;
  429         } else if ((*src & 0xe0) == 0xc0) {
  430                 /*
  431                  * leading 3 bits are "110"
  432                  *  utf-8: 110xxxxx 10yyyyyy
  433                  *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
  434                  */
  435                 w = 2;
  436                 /* get trailing 5 bits */
  437                 ucs4 = *src & 0x1f;
  438         } else if ((*src & 0xf0) == 0xe0) {
  439                 /*
  440                  * leading 4 bits are "1110"
  441                  *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
  442                  *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
  443                  */
  444                 w = 3;
  445                 /* get trailing 4 bits */
  446                 ucs4 = *src & 0x0f;
  447         } else if ((*src & 0xf8) == 0xf0) {
  448                 /*
  449                  * leading 5 bits are "11110"
  450                  *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
  451                  *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
  452                  */
  453                 w = 4;
  454                 /* get trailing 3 bits */
  455                 ucs4 = *src & 0x07;
  456         } else {
  457                 /* out of utf-16 range or having illegal bits */
  458                 return (0);
  459         }
  460         if (w == 0)
  461                 return (0);
  462 
  463         if (srclen < w)
  464                 return (0);
  465 
  466         /*
  467          * get left parts from utf-8
  468          */
  469         for (i = 1 ; i < w ; i++) {
  470                 if ((*(src + i) & 0xc0) != 0x80) {
  471                         /* invalid: leading 2 bits are not "10" */
  472                         return (0);
  473                 }
  474                 /* concatenate trailing 6 bits into ucs4 */
  475                 ucs4 <<= 6;
  476                 ucs4 |= *(src + i) & 0x3f;
  477         }
  478 
  479         *utf8width = w;
  480         return (ucs4);
  481 }
  482 
  483 static u_char *
  484 ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
  485 {
  486         u_char lead, *p;
  487         size_t i, w;
  488 
  489         /*
  490          * determine utf-8 width and leading bits
  491          */
  492         if (ucs4 < 0x80) {
  493                 w = 1;
  494                 lead = 0;       /* "" */
  495         } else if (ucs4 < 0x800) {
  496                 w = 2;
  497                 lead = 0xc0;    /* "11" */
  498         } else if (ucs4 < 0x10000) {
  499                 w = 3;
  500                 lead = 0xe0;    /* "111" */
  501         } else if (ucs4 < 0x200000) {
  502                 w = 4;
  503                 lead = 0xf0;    /* "1111" */
  504         } else {
  505                 return (NULL);
  506         }
  507 
  508         if (dstlen < w)
  509                 return (NULL);
  510 
  511         /*
  512          * construct utf-8
  513          */
  514         p = dst;
  515         for (i = w - 1 ; i >= 1 ; i--) {
  516                 /* get trailing 6 bits and put it with leading bit as "1" */
  517                 *(p + i) = (ucs4 & 0x3f) | 0x80;
  518                 ucs4 >>= 6;
  519         }
  520         *p = ucs4 | lead;
  521 
  522         *utf8width = w;
  523 
  524         return (p);
  525 }
  526 
  527 static uint32_t
  528 encode_surrogate(register uint32_t code)
  529 {
  530         return ((((code - 0x10000) << 6) & 0x3ff0000) |
  531             ((code - 0x10000) & 0x3ff) | 0xd800dc00);
  532 }
  533 
  534 static uint32_t
  535 decode_surrogate(register const u_char *ucs)
  536 {
  537         return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) |
  538             ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000);
  539 }
  540
Cache object: 8413b89f8909baf940c5c098a0e87630
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/libkern/iconv_ucs.c

FreeBSD/Linux Kernel Cross Reference
sys/libkern/iconv_ucs.c