The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/libkern/iconv_ucs.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2003, 2005 Ryuichiro Imura
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  */
   28 
   29 #include <sys/cdefs.h>
   30 __FBSDID("$FreeBSD: releng/12.0/sys/libkern/iconv_ucs.c 326271 2017-11-27 15:20:12Z pfg $");
   31 
   32 #include <sys/param.h>
   33 #include <sys/kernel.h>
   34 #include <sys/systm.h>
   35 #include <sys/malloc.h>
   36 #include <sys/iconv.h>
   37 
   38 #include "iconv_converter_if.h"
   39 
   40 /*
   41  * "UCS" converter
   42  */
   43 
/*
 * Bits stored in iconv_ucs.convtype.  iconv_ucs_open() derives them from
 * the charset names of the conversion and iconv_ucs_conv() dispatches on
 * them for each character.
 */
   44 #define KICONV_UCS_COMBINE      0x1     /* a second charset pair (cspf) was given to open */
   45 #define KICONV_UCS_FROM_UTF8    0x2     /* source charset is "UTF-8" */
   46 #define KICONV_UCS_TO_UTF8      0x4     /* destination charset is "UTF-8" */
   47 #define KICONV_UCS_FROM_LE      0x8     /* source 16-bit units are byte-swapped on read */
   48 #define KICONV_UCS_TO_LE        0x10    /* destination 16-bit units are byte-swapped on write */
   49 #define KICONV_UCS_FROM_UTF16   0x20    /* source may contain surrogate pairs */
   50 #define KICONV_UCS_TO_UTF16     0x40    /* destination may receive surrogate pairs */
   51 #define KICONV_UCS_UCS4         0x80    /* set when ENCODING_UNICODE equals ENCODING_UTF16 */
   52 
/* Charset names compared against / passed to iconv_open() in this file. */
   53 #define ENCODING_UTF16  "UTF-16BE"
   54 #define ENCODING_UTF8   "UTF-8"
   55 
   56 static struct {
   57         const char *name;
   58         int from_flag, to_flag;
   59 } unicode_family[] = {
   60         { "UTF-8",      KICONV_UCS_FROM_UTF8,   KICONV_UCS_TO_UTF8 },
   61         { "UCS-2LE",    KICONV_UCS_FROM_LE,     KICONV_UCS_TO_LE },
   62         { "UTF-16BE",   KICONV_UCS_FROM_UTF16,  KICONV_UCS_TO_UTF16 },
   63         { "UTF-16LE",   KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
   64             KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
   65         { NULL,         0,      0 }
   66 };
   67 
/*
 * Forward declarations of the file-local UTF-8 and surrogate-pair helpers;
 * their definitions follow the converter methods.
 */
   68 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
   69 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
   70 static uint32_t encode_surrogate(uint32_t code);
   71 static uint32_t decode_surrogate(const u_char *ucs);
   72 
/*
 * Declare a dependency of iconv_ucs on libiconv, but only where the
 * MODULE_DEPEND macro exists.
 * NOTE(review): the three 2s are presumably the minimum, preferred and
 * maximum libiconv versions -- confirm against MODULE_DEPEND(9).
 */
   73 #ifdef MODULE_DEPEND
   74 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
   75 #endif
   76 
   77 /*
   78  * UCS converter instance
   79  */
/*
 * Per-conversion state, created by iconv_ucs_open() and released by
 * iconv_ucs_close().
 */
   80 struct iconv_ucs {
   81         KOBJ_FIELDS;
   82         int                     convtype;       /* bitmask of KICONV_UCS_* flags */
   83         struct iconv_cspair *   d_csp;          /* the csp argument given to open */
   84         struct iconv_cspair *   d_cspf;         /* the cspf argument; stored only for a UTF-8/LE source */
   85         void *                  f_ctp;          /* iconv_open(ENCODING_UNICODE, from): source -> Unicode */
   86         void *                  t_ctp;          /* iconv_open(to, ENCODING_UNICODE): Unicode -> destination */
   87         void *                  ctype;          /* iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8): passed to towlower/towupper */
   88 };
   89 
   90 static int
   91 iconv_ucs_open(struct iconv_converter_class *dcp,
   92         struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
   93 {
   94         struct iconv_ucs *dp;
   95         int i;
   96         const char *from, *to;
   97 
   98         dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
   99         to = csp->cp_to;
  100         from = cspf ? cspf->cp_from : csp->cp_from;
  101 
  102         dp->convtype = 0;
  103 
  104         if (cspf)
  105                 dp->convtype |= KICONV_UCS_COMBINE;
  106         for (i = 0; unicode_family[i].name; i++) {
  107                 if (strcasecmp(from, unicode_family[i].name) == 0)
  108                         dp->convtype |= unicode_family[i].from_flag;
  109                 if (strcasecmp(to, unicode_family[i].name) == 0)
  110                         dp->convtype |= unicode_family[i].to_flag;
  111         }
  112         if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
  113                 dp->convtype |= KICONV_UCS_UCS4;
  114         else
  115                 dp->convtype &= ~KICONV_UCS_UCS4;
  116 
  117         dp->f_ctp = dp->t_ctp = NULL;
  118         if (dp->convtype & KICONV_UCS_COMBINE) {
  119                 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
  120                     (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
  121                         iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
  122                 }
  123                 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
  124                     (dp->convtype & KICONV_UCS_TO_LE) == 0) {
  125                         iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
  126                 }
  127         }
  128 
  129         dp->ctype = NULL;
  130         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
  131                 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
  132 
  133         dp->d_csp = csp;
  134         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
  135                 if (cspf) {
  136                         dp->d_cspf = cspf;
  137                         cspf->cp_refcount++;
  138                 } else
  139                         csp->cp_refcount++;
  140         }
  141         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
  142                 csp->cp_refcount++;
  143         *dpp = (void*)dp;
  144         return 0;
  145 }
  146 
  147 static int
  148 iconv_ucs_close(void *data)
  149 {
  150         struct iconv_ucs *dp = data;
  151 
  152         if (dp->f_ctp)
  153                 iconv_close(dp->f_ctp);
  154         if (dp->t_ctp)
  155                 iconv_close(dp->t_ctp);
  156         if (dp->ctype)
  157                 iconv_close(dp->ctype);
  158         if (dp->d_cspf)
  159                 dp->d_cspf->cp_refcount--;
  160         else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
  161                 dp->d_csp->cp_refcount--;
  162         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
  163                 dp->d_csp->cp_refcount--;
  164         kobj_delete((struct kobj*)data, M_ICONV);
  165         return 0;
  166 }
  167 
  168 static int
  169 iconv_ucs_conv(void *d2p, const char **inbuf,
  170         size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
  171         int convchar, int casetype)
  172 {
  173         struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
  174         int ret = 0, i;
  175         size_t in, on, ir, or, inlen, outlen, ucslen;
  176         const char *src, *p;
  177         char *dst;
  178         u_char ucs[4], *q;
  179         uint32_t code;
  180 
  181         if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
  182                 return 0;
  183         ir = in = *inbytesleft;
  184         or = on = *outbytesleft;
  185         src = *inbuf;
  186         dst = *outbuf;
  187 
  188         while (ir > 0 && or > 0) {
  189 
  190                 /*
  191                  * The first half of conversion.
  192                  * (convert any code into ENCODING_UNICODE)
  193                  */
  194                 code = 0;
  195                 p = src;
  196                 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
  197                         /* convert UTF-8 to ENCODING_UNICODE */
  198                         inlen = 0;
  199                         code = utf8_to_ucs4(p, &inlen, ir);
  200                         if (code == 0) {
  201                                 ret = -1;
  202                                 break;
  203                         }
  204 
  205                         if (casetype == KICONV_FROM_LOWER && dp->ctype) {
  206                                 code = towlower(code, dp->ctype);
  207                         } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
  208                                 code = towupper(code, dp->ctype);
  209                         }
  210 
  211                         if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
  212                                 /* reserved for utf-16 surrogate pair */
  213                                 /* invalid unicode */
  214                                 ret = -1;
  215                                 break;
  216                         }
  217 
  218                         if (inlen == 4) {
  219                                 if (dp->convtype & KICONV_UCS_UCS4) {
  220                                         ucslen = 4;
  221                                         code = encode_surrogate(code);
  222                                 } else {
  223                                         /* can't handle with ucs-2 */
  224                                         ret = -1;
  225                                         break;
  226                                 }
  227                         } else {
  228                                 ucslen = 2;
  229                         }
  230 
  231                         /* save UCS-4 into ucs[] */
  232                         for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
  233                                 *q++ = (code >> (i << 3)) & 0xff;
  234 
  235                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
  236                         /* convert local code to ENCODING_UNICODE */
  237                         ucslen = 4;
  238                         inlen = ir;
  239                         q = ucs;
  240                         ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
  241                             &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
  242                         if (ret)
  243                                 break;
  244                         inlen = ir - inlen;
  245                         ucslen = 4 - ucslen;
  246 
  247                 } else {
  248                         /* src code is a proper subset of ENCODING_UNICODE */
  249                         q = ucs;
  250                         if (dp->convtype & KICONV_UCS_FROM_LE) {
  251                                 *q = *(p + 1);
  252                                 *(q + 1) = *p;
  253                                 p += 2;
  254                         } else {
  255                                 *q = *p++;
  256                                 *(q + 1) = *p++;
  257                         }
  258                         if ((*q & 0xfc) == 0xd8) {
  259                                 if (dp->convtype & KICONV_UCS_UCS4 &&
  260                                     dp->convtype & KICONV_UCS_FROM_UTF16) {
  261                                         inlen = ucslen = 4;
  262                                 } else {
  263                                         /* invalid unicode */
  264                                         ret = -1;
  265                                         break;
  266                                 }
  267                         } else {
  268                                 inlen = ucslen = 2;
  269                         }
  270                         if (ir < inlen) {
  271                                 ret = -1;
  272                                 break;
  273                         }
  274                         if (ucslen == 4) {
  275                                 q += 2;
  276                                 if (dp->convtype & KICONV_UCS_FROM_LE) {
  277                                         *q = *(p + 1);
  278                                         *(q + 1) = *p;
  279                                 } else {
  280                                         *q = *p++;
  281                                         *(q + 1) = *p;
  282                                 }
  283                                 if ((*q & 0xfc) != 0xdc) {
  284                                         /* invalid unicode */
  285                                         ret = -1;
  286                                         break;
  287                                 }
  288                         }
  289                 }
  290 
  291                 /*
  292                  * The second half of conversion.
  293                  * (convert ENCODING_UNICODE into any code)
  294                  */
  295                 p = ucs;
  296                 if (dp->convtype & KICONV_UCS_TO_UTF8) {
  297                         q = (u_char *)dst;
  298                         if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
  299                                 /* decode surrogate pair */
  300                                 code = decode_surrogate(p);
  301                         } else {
  302                                 code = (ucs[0] << 8) | ucs[1];
  303                         }
  304 
  305                         if (casetype == KICONV_LOWER && dp->ctype) {
  306                                 code = towlower(code, dp->ctype);
  307                         } else if (casetype == KICONV_UPPER && dp->ctype) {
  308                                 code = towupper(code, dp->ctype);
  309                         }
  310 
  311                         outlen = 0;
  312                         if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
  313                                 ret = -1;
  314                                 break;
  315                         }
  316 
  317                         src += inlen;
  318                         ir -= inlen;
  319                         dst += outlen;
  320                         or -= outlen;
  321 
  322                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
  323                         ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
  324                             &or, casetype & (KICONV_LOWER | KICONV_UPPER));
  325                         if (ret)
  326                                 break;
  327 
  328                         src += inlen;
  329                         ir -= inlen;
  330 
  331                 } else {
  332                         /* dst code is a proper subset of ENCODING_UNICODE */
  333                         if (or < ucslen) {
  334                                 ret = -1;
  335                                 break;
  336                         }
  337                         src += inlen;
  338                         ir -= inlen;
  339                         or -= ucslen;
  340                         if (dp->convtype & KICONV_UCS_TO_LE) {
  341                                 *dst++ = *(p + 1);
  342                                 *dst++ = *p;
  343                                 p += 2;
  344                         } else {
  345                                 *dst++ = *p++;
  346                                 *dst++ = *p++;
  347                         }
  348                         if (ucslen == 4) {
  349                                 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
  350                                     (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
  351                                         ret = -1;
  352                                         break;
  353                                 }
  354                                 if (dp->convtype & KICONV_UCS_TO_LE) {
  355                                         *dst++ = *(p + 1);
  356                                         *dst++ = *p;
  357                                 } else {
  358                                         *dst++ = *p++;
  359                                         *dst++ = *p;
  360                                 }
  361                         }
  362                 }
  363 
  364                 if (convchar == 1)
  365                         break;
  366         }
  367 
  368         *inbuf += in - ir;
  369         *outbuf += on - or;
  370         *inbytesleft -= in - ir;
  371         *outbytesleft -= on - or;
  372         return (ret);
  373 }
  374 
  375 static int
  376 iconv_ucs_init(struct iconv_converter_class *dcp)
  377 {
  378         int error;
  379 
  380         error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
  381         if (error)
  382                 return (error);
  383         error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
  384         if (error)
  385                 return (error);
  386         return (0);
  387 }
  388 
  389 static int
  390 iconv_ucs_done(struct iconv_converter_class *dcp)
  391 {
  392         return (0);
  393 }
  394 
  395 static const char *
  396 iconv_ucs_name(struct iconv_converter_class *dcp)
  397 {
  398         return (ENCODING_UNICODE);
  399 }
  400 
/*
 * Method table: binds each iconv_converter interface method to its
 * implementation in this file.
 */
  401 static kobj_method_t iconv_ucs_methods[] = {
  402         KOBJMETHOD(iconv_converter_open,        iconv_ucs_open),
  403         KOBJMETHOD(iconv_converter_close,       iconv_ucs_close),
  404         KOBJMETHOD(iconv_converter_conv,        iconv_ucs_conv),
  405         KOBJMETHOD(iconv_converter_init,        iconv_ucs_init),
  406         KOBJMETHOD(iconv_converter_done,        iconv_ucs_done),
  407         KOBJMETHOD(iconv_converter_name,        iconv_ucs_name),
  408         {0, 0}  /* end-of-table marker */
  409 };
  410 
/*
 * Instantiate the "ucs" converter class with struct iconv_ucs as its
 * per-instance size.
 * NOTE(review): presumably the macro ties the class to iconv_ucs_methods
 * by name -- confirm against its definition in <sys/iconv.h>.
 */
  411 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
  412 
  413 static uint32_t
  414 utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
  415 {
  416         size_t i, w = 0;
  417         uint32_t ucs4 = 0;
  418 
  419         /*
  420          * get leading 1 byte from utf-8
  421          */
  422         if ((*src & 0x80) == 0) {
  423                 /*
  424                  * leading 1 bit is ""
  425                  *  utf-8: 0xxxxxxx
  426                  *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
  427                  */
  428                 w = 1;
  429                 /* get trailing 7 bits */
  430                 ucs4 = *src & 0x7f;
  431         } else if ((*src & 0xe0) == 0xc0) {
  432                 /*
  433                  * leading 3 bits are "110"
  434                  *  utf-8: 110xxxxx 10yyyyyy
  435                  *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
  436                  */
  437                 w = 2;
  438                 /* get trailing 5 bits */
  439                 ucs4 = *src & 0x1f;
  440         } else if ((*src & 0xf0) == 0xe0) {
  441                 /*
  442                  * leading 4 bits are "1110"
  443                  *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
  444                  *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
  445                  */
  446                 w = 3;
  447                 /* get trailing 4 bits */
  448                 ucs4 = *src & 0x0f;
  449         } else if ((*src & 0xf8) == 0xf0) {
  450                 /*
  451                  * leading 5 bits are "11110"
  452                  *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
  453                  *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
  454                  */
  455                 w = 4;
  456                 /* get trailing 3 bits */
  457                 ucs4 = *src & 0x07;
  458         } else {
  459                 /* out of utf-16 range or having illegal bits */
  460                 return (0);
  461         }
  462 
  463         if (srclen < w)
  464                 return (0);
  465 
  466         /*
  467          * get left parts from utf-8
  468          */
  469         for (i = 1 ; i < w ; i++) {
  470                 if ((*(src + i) & 0xc0) != 0x80) {
  471                         /* invalid: leading 2 bits are not "10" */
  472                         return (0);
  473                 }
  474                 /* concatenate trailing 6 bits into ucs4 */
  475                 ucs4 <<= 6;
  476                 ucs4 |= *(src + i) & 0x3f;
  477         }
  478 
  479         *utf8width = w;
  480         return (ucs4);
  481 }
  482 
  483 static u_char *
  484 ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
  485 {
  486         u_char lead, *p;
  487         size_t i, w;
  488 
  489         /*
  490          * determine utf-8 width and leading bits
  491          */
  492         if (ucs4 < 0x80) {
  493                 w = 1;
  494                 lead = 0;       /* "" */
  495         } else if (ucs4 < 0x800) {
  496                 w = 2;
  497                 lead = 0xc0;    /* "11" */
  498         } else if (ucs4 < 0x10000) {
  499                 w = 3;
  500                 lead = 0xe0;    /* "111" */
  501         } else if (ucs4 < 0x200000) {
  502                 w = 4;
  503                 lead = 0xf0;    /* "1111" */
  504         } else {
  505                 return (NULL);
  506         }
  507 
  508         if (dstlen < w)
  509                 return (NULL);
  510 
  511         /*
  512          * construct utf-8
  513          */
  514         p = dst;
  515         for (i = w - 1 ; i >= 1 ; i--) {
  516                 /* get trailing 6 bits and put it with leading bit as "1" */
  517                 *(p + i) = (ucs4 & 0x3f) | 0x80;
  518                 ucs4 >>= 6;
  519         }
  520         *p = ucs4 | lead;
  521 
  522         *utf8width = w;
  523 
  524         return (p);
  525 }
  526 
  527 static uint32_t
  528 encode_surrogate(uint32_t code)
  529 {
  530         return ((((code - 0x10000) << 6) & 0x3ff0000) |
  531             ((code - 0x10000) & 0x3ff) | 0xd800dc00);
  532 }
  533 
  534 static uint32_t
  535 decode_surrogate(const u_char *ucs)
  536 {
  537         return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) |
  538             ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000);
  539 }
  540 

Cache object: a0e9135d976a05c4f96d6c2c08dec00e


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.