The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/unicode/uconv.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
   23  * Use is subject to license terms.
   24  */
   25 
   26 
   27 
   28 /*
   29  * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
   30  * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
   31  * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
   32  * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
   33  * the section 3C man pages.
   34  * Interface stability: Committed
   35  */
   36 
   37 #include <sys/types.h>
   38 #ifdef  _KERNEL
   39 #include <sys/param.h>
   40 #include <sys/sysmacros.h>
   41 #include <sys/debug.h>
   42 #include <sys/kmem.h>
   43 #include <sys/sunddi.h>
   44 #else
   45 #include <sys/u8_textprep.h>
   46 #endif  /* _KERNEL */
   47 #include <sys/byteorder.h>
   48 #include <sys/errno.h>
   49 
   50 
   51 /*
   52  * The max and min values of high and low surrogate pairs of UTF-16,
   53  * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
   54  */
   55 #define UCONV_U16_HI_MIN        (0xd800U)
   56 #define UCONV_U16_HI_MAX        (0xdbffU)
   57 #define UCONV_U16_LO_MIN        (0xdc00U)
   58 #define UCONV_U16_LO_MAX        (0xdfffU)
   59 #define UCONV_U16_BIT_SHIFT     (0x0400U)
   60 #define UCONV_U16_BIT_MASK      (0x0fffffU)
   61 #define UCONV_U16_START         (0x010000U)
   62 
   63 /* The maximum value of Unicode coding space and ASCII coding space. */
   64 #define UCONV_UNICODE_MAX       (0x10ffffU)
   65 #define UCONV_ASCII_MAX         (0x7fU)
   66 
   67 /* The mask values for input and output endians. */
   68 #define UCONV_IN_ENDIAN_MASKS   (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
   69 #define UCONV_OUT_ENDIAN_MASKS  (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)
   70 
   71 /* Native and reversed endian macros. */
   72 #ifdef  _ZFS_BIG_ENDIAN
   73 #define UCONV_IN_NAT_ENDIAN     UCONV_IN_BIG_ENDIAN
   74 #define UCONV_IN_REV_ENDIAN     UCONV_IN_LITTLE_ENDIAN
   75 #define UCONV_OUT_NAT_ENDIAN    UCONV_OUT_BIG_ENDIAN
   76 #define UCONV_OUT_REV_ENDIAN    UCONV_OUT_LITTLE_ENDIAN
   77 #else
   78 #define UCONV_IN_NAT_ENDIAN     UCONV_IN_LITTLE_ENDIAN
   79 #define UCONV_IN_REV_ENDIAN     UCONV_IN_BIG_ENDIAN
   80 #define UCONV_OUT_NAT_ENDIAN    UCONV_OUT_LITTLE_ENDIAN
   81 #define UCONV_OUT_REV_ENDIAN    UCONV_OUT_BIG_ENDIAN
   82 #endif  /* _ZFS_BIG_ENDIAN */
   83 
   84 /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
   85 #define UCONV_BOM_NORMAL        (0xfeffU)
   86 #define UCONV_BOM_SWAPPED       (0xfffeU)
   87 #define UCONV_BOM_SWAPPED_32    (0xfffe0000U)
   88 
   89 /* UTF-32 boundaries based on UTF-8 character byte lengths. */
   90 #define UCONV_U8_ONE_BYTE       (0x7fU)
   91 #define UCONV_U8_TWO_BYTES      (0x7ffU)
   92 #define UCONV_U8_THREE_BYTES    (0xffffU)
   93 #define UCONV_U8_FOUR_BYTES     (0x10ffffU)
   94 
   95 /* The common minimum and maximum values at the UTF-8 character bytes. */
   96 #define UCONV_U8_BYTE_MIN       (0x80U)
   97 #define UCONV_U8_BYTE_MAX       (0xbfU)
   98 
   99 /*
  100  * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
  101  * UTF-8 character bytes.
  102  */
  103 #define UCONV_U8_BIT_SHIFT      6
  104 #define UCONV_U8_BIT_MASK       0x3f
  105 
  106 /*
  107  * The following vector shows remaining bytes in a UTF-8 character.
  108  * Index will be the first byte of the character.
  109  */
  110 static const uchar_t remaining_bytes_tbl[0x100] = {
  111         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  112         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  113         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  114         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  115         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  116         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  117         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  118         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  119         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  120         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  121         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  122         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  123 
  124 /*      C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
  125         0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
  126 
  127 /*      D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
  128         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
  129 
  130 /*      E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
  131         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
  132 
  133 /*      F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
  134         3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
  135 };
  136 
  137 /*
  138  * The following is a vector of bit-masks used to extract the payload bits
  139  * from the first byte of a UTF-8 character.  The index is the number of
  140  * remaining bytes that follow that first byte (see the table above).
  141  */
  142 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  143 
  144 /*
  145  * The following two vectors are to provide valid minimum and
  146  * maximum values for the 2'nd byte of a multibyte UTF-8 character for
  147  * better illegal sequence checking. The index value must be the value of
  148  * the first byte of the UTF-8 character.
  149  */
  150 static const uchar_t valid_min_2nd_byte[0x100] = {
  151         0,    0,    0,    0,    0,    0,    0,    0,
  152         0,    0,    0,    0,    0,    0,    0,    0,
  153         0,    0,    0,    0,    0,    0,    0,    0,
  154         0,    0,    0,    0,    0,    0,    0,    0,
  155         0,    0,    0,    0,    0,    0,    0,    0,
  156         0,    0,    0,    0,    0,    0,    0,    0,
  157         0,    0,    0,    0,    0,    0,    0,    0,
  158         0,    0,    0,    0,    0,    0,    0,    0,
  159         0,    0,    0,    0,    0,    0,    0,    0,
  160         0,    0,    0,    0,    0,    0,    0,    0,
  161         0,    0,    0,    0,    0,    0,    0,    0,
  162         0,    0,    0,    0,    0,    0,    0,    0,
  163         0,    0,    0,    0,    0,    0,    0,    0,
  164         0,    0,    0,    0,    0,    0,    0,    0,
  165         0,    0,    0,    0,    0,    0,    0,    0,
  166         0,    0,    0,    0,    0,    0,    0,    0,
  167         0,    0,    0,    0,    0,    0,    0,    0,
  168         0,    0,    0,    0,    0,    0,    0,    0,
  169         0,    0,    0,    0,    0,    0,    0,    0,
  170         0,    0,    0,    0,    0,    0,    0,    0,
  171         0,    0,    0,    0,    0,    0,    0,    0,
  172         0,    0,    0,    0,    0,    0,    0,    0,
  173         0,    0,    0,    0,    0,    0,    0,    0,
  174         0,    0,    0,    0,    0,    0,    0,    0,
  175 
  176 /*      C0    C1    C2    C3    C4    C5    C6    C7 */
  177         0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  178 
  179 /*      C8    C9    CA    CB    CC    CD    CE    CF */
  180         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  181 
  182 /*      D0    D1    D2    D3    D4    D5    D6    D7 */
  183         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  184 
  185 /*      D8    D9    DA    DB    DC    DD    DE    DF */
  186         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  187 
  188 /*      E0    E1    E2    E3    E4    E5    E6    E7 */
  189         0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  190 
  191 /*      E8    E9    EA    EB    EC    ED    EE    EF */
  192         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  193 
  194 /*      F0    F1    F2    F3    F4    F5    F6    F7 */
  195         0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
  196 
  197         0,    0,    0,    0,    0,    0,    0,    0
  198 };
  199 
  200 static const uchar_t valid_max_2nd_byte[0x100] = {
  201         0,    0,    0,    0,    0,    0,    0,    0,
  202         0,    0,    0,    0,    0,    0,    0,    0,
  203         0,    0,    0,    0,    0,    0,    0,    0,
  204         0,    0,    0,    0,    0,    0,    0,    0,
  205         0,    0,    0,    0,    0,    0,    0,    0,
  206         0,    0,    0,    0,    0,    0,    0,    0,
  207         0,    0,    0,    0,    0,    0,    0,    0,
  208         0,    0,    0,    0,    0,    0,    0,    0,
  209         0,    0,    0,    0,    0,    0,    0,    0,
  210         0,    0,    0,    0,    0,    0,    0,    0,
  211         0,    0,    0,    0,    0,    0,    0,    0,
  212         0,    0,    0,    0,    0,    0,    0,    0,
  213         0,    0,    0,    0,    0,    0,    0,    0,
  214         0,    0,    0,    0,    0,    0,    0,    0,
  215         0,    0,    0,    0,    0,    0,    0,    0,
  216         0,    0,    0,    0,    0,    0,    0,    0,
  217         0,    0,    0,    0,    0,    0,    0,    0,
  218         0,    0,    0,    0,    0,    0,    0,    0,
  219         0,    0,    0,    0,    0,    0,    0,    0,
  220         0,    0,    0,    0,    0,    0,    0,    0,
  221         0,    0,    0,    0,    0,    0,    0,    0,
  222         0,    0,    0,    0,    0,    0,    0,    0,
  223         0,    0,    0,    0,    0,    0,    0,    0,
  224         0,    0,    0,    0,    0,    0,    0,    0,
  225 
  226 /*      C0    C1    C2    C3    C4    C5    C6    C7 */
  227         0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
  228 
  229 /*      C8    C9    CA    CB    CC    CD    CE    CF */
  230         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
  231 
  232 /*      D0    D1    D2    D3    D4    D5    D6    D7 */
  233         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
  234 
  235 /*      D8    D9    DA    DB    DC    DD    DE    DF */
  236         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
  237 
  238 /*      E0    E1    E2    E3    E4    E5    E6    E7 */
  239         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
  240 
  241 /*      E8    E9    EA    EB    EC    ED    EE    EF */
  242         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
  243 
  244 /*      F0    F1    F2    F3    F4    F5    F6    F7 */
  245         0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
  246 
  247         0,    0,    0,    0,    0,    0,    0,    0
  248 };
  249 
  250 
  251 static int
  252 check_endian(int flag, int *in, int *out)
  253 {
  254         *in = flag & UCONV_IN_ENDIAN_MASKS;
  255 
  256         /* You cannot have both. */
  257         if (*in == UCONV_IN_ENDIAN_MASKS)
  258                 return (EBADF);
  259 
  260         if (*in == 0)
  261                 *in = UCONV_IN_NAT_ENDIAN;
  262 
  263         *out = flag & UCONV_OUT_ENDIAN_MASKS;
  264 
  265         /* You cannot have both. */
  266         if (*out == UCONV_OUT_ENDIAN_MASKS)
  267                 return (EBADF);
  268 
  269         if (*out == 0)
  270                 *out = UCONV_OUT_NAT_ENDIAN;
  271 
  272         return (0);
  273 }
  274 
  275 static boolean_t
  276 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
  277 {
  278         if (u16l > 0) {
  279                 if (*u16s == UCONV_BOM_NORMAL) {
  280                         *in = UCONV_IN_NAT_ENDIAN;
  281                         return (B_TRUE);
  282                 }
  283                 if (*u16s == UCONV_BOM_SWAPPED) {
  284                         *in = UCONV_IN_REV_ENDIAN;
  285                         return (B_TRUE);
  286                 }
  287         }
  288 
  289         return (B_FALSE);
  290 }
  291 
  292 static boolean_t
  293 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
  294 {
  295         if (u32l > 0) {
  296                 if (*u32s == UCONV_BOM_NORMAL) {
  297                         *in = UCONV_IN_NAT_ENDIAN;
  298                         return (B_TRUE);
  299                 }
  300                 if (*u32s == UCONV_BOM_SWAPPED_32) {
  301                         *in = UCONV_IN_REV_ENDIAN;
  302                         return (B_TRUE);
  303                 }
  304         }
  305 
  306         return (B_FALSE);
  307 }
  308 
  309 int
  310 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
  311     uint32_t *u32s, size_t *utf32len, int flag)
  312 {
  313         int inendian;
  314         int outendian;
  315         size_t u16l;
  316         size_t u32l;
  317         uint32_t hi;
  318         uint32_t lo;
  319         boolean_t do_not_ignore_null;
  320 
  321         /*
  322          * Do preliminary validity checks on parameters and collect info on
  323          * endians.
  324          */
  325         if (u16s == NULL || utf16len == NULL)
  326                 return (EILSEQ);
  327 
  328         if (u32s == NULL || utf32len == NULL)
  329                 return (E2BIG);
  330 
  331         if (check_endian(flag, &inendian, &outendian) != 0)
  332                 return (EBADF);
  333 
  334         /*
  335          * Initialize input and output parameter buffer indices and
  336          * temporary variables.
  337          */
  338         u16l = u32l = 0;
  339         hi = 0;
  340         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
  341 
  342         /*
  343          * Check on the BOM at the beginning of the input buffer if required
  344          * and if there is indeed one, process it.
  345          */
  346         if ((flag & UCONV_IN_ACCEPT_BOM) &&
  347             check_bom16(u16s, *utf16len, &inendian))
  348                 u16l++;
  349 
  350         /*
  351          * Reset inendian and outendian so that after this point, those can be
  352          * used as condition values.
  353          */
  354         inendian &= UCONV_IN_NAT_ENDIAN;
  355         outendian &= UCONV_OUT_NAT_ENDIAN;
  356 
  357         /*
  358          * If there is something in the input buffer and if necessary and
  359          * requested, save the BOM at the output buffer.
  360          */
  361         if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
  362                 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
  363                     UCONV_BOM_SWAPPED_32;
  364 
  365         /*
  366          * Do conversion; if encounter a surrogate pair, assemble high and
  367          * low pair values to form a UTF-32 character. If a half of a pair
  368          * exists alone, then, either it is an illegal (EILSEQ) or
  369          * invalid (EINVAL) value.
  370          */
  371         for (; u16l < *utf16len; u16l++) {
  372                 if (u16s[u16l] == 0 && do_not_ignore_null)
  373                         break;
  374 
  375                 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
  376 
  377                 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
  378                         if (hi)
  379                                 return (EILSEQ);
  380                         hi = lo;
  381                         continue;
  382                 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
  383                         if (! hi)
  384                                 return (EILSEQ);
  385                         lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
  386                             lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
  387                             + UCONV_U16_START;
  388                         hi = 0;
  389                 } else if (hi) {
  390                         return (EILSEQ);
  391                 }
  392 
  393                 if (u32l >= *utf32len)
  394                         return (E2BIG);
  395 
  396                 u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
  397         }
  398 
  399         /*
  400          * If high half didn't see low half, then, it's most likely the input
  401          * parameter is incomplete.
  402          */
  403         if (hi)
  404                 return (EINVAL);
  405 
  406         /*
  407          * Save the number of consumed and saved characters. They do not
  408          * include terminating NULL character (U+0000) at the end of
  409          * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
  410          * the input buffer length is big enough to include the terminating
  411          * NULL character).
  412          */
  413         *utf16len = u16l;
  414         *utf32len = u32l;
  415 
  416         return (0);
  417 }
  418 
  419 int
  420 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
  421     uchar_t *u8s, size_t *utf8len, int flag)
  422 {
  423         int inendian;
  424         int outendian;
  425         size_t u16l;
  426         size_t u8l;
  427         uint32_t hi;
  428         uint32_t lo;
  429         boolean_t do_not_ignore_null;
  430 
  431         if (u16s == NULL || utf16len == NULL)
  432                 return (EILSEQ);
  433 
  434         if (u8s == NULL || utf8len == NULL)
  435                 return (E2BIG);
  436 
  437         if (check_endian(flag, &inendian, &outendian) != 0)
  438                 return (EBADF);
  439 
  440         u16l = u8l = 0;
  441         hi = 0;
  442         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
  443 
  444         if ((flag & UCONV_IN_ACCEPT_BOM) &&
  445             check_bom16(u16s, *utf16len, &inendian))
  446                 u16l++;
  447 
  448         inendian &= UCONV_IN_NAT_ENDIAN;
  449 
  450         for (; u16l < *utf16len; u16l++) {
  451                 if (u16s[u16l] == 0 && do_not_ignore_null)
  452                         break;
  453 
  454                 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
  455 
  456                 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
  457                         if (hi)
  458                                 return (EILSEQ);
  459                         hi = lo;
  460                         continue;
  461                 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
  462                         if (! hi)
  463                                 return (EILSEQ);
  464                         lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
  465                             lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
  466                             + UCONV_U16_START;
  467                         hi = 0;
  468                 } else if (hi) {
  469                         return (EILSEQ);
  470                 }
  471 
  472                 /*
  473                  * Now we convert a UTF-32 character into a UTF-8 character.
  474                  * Unicode coding space is between U+0000 and U+10FFFF;
  475                  * anything bigger is an illegal character.
  476                  */
  477                 if (lo <= UCONV_U8_ONE_BYTE) {
  478                         if (u8l >= *utf8len)
  479                                 return (E2BIG);
  480                         u8s[u8l++] = (uchar_t)lo;
  481                 } else if (lo <= UCONV_U8_TWO_BYTES) {
  482                         if ((u8l + 1) >= *utf8len)
  483                                 return (E2BIG);
  484                         u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
  485                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
  486                 } else if (lo <= UCONV_U8_THREE_BYTES) {
  487                         if ((u8l + 2) >= *utf8len)
  488                                 return (E2BIG);
  489                         u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
  490                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
  491                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
  492                 } else if (lo <= UCONV_U8_FOUR_BYTES) {
  493                         if ((u8l + 3) >= *utf8len)
  494                                 return (E2BIG);
  495                         u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
  496                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
  497                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
  498                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
  499                 } else {
  500                         return (EILSEQ);
  501                 }
  502         }
  503 
  504         if (hi)
  505                 return (EINVAL);
  506 
  507         *utf16len = u16l;
  508         *utf8len = u8l;
  509 
  510         return (0);
  511 }
  512 
  513 int
  514 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
  515     uint16_t *u16s, size_t *utf16len, int flag)
  516 {
  517         int inendian;
  518         int outendian;
  519         size_t u16l;
  520         size_t u32l;
  521         uint32_t hi;
  522         uint32_t lo;
  523         boolean_t do_not_ignore_null;
  524 
  525         if (u32s == NULL || utf32len == NULL)
  526                 return (EILSEQ);
  527 
  528         if (u16s == NULL || utf16len == NULL)
  529                 return (E2BIG);
  530 
  531         if (check_endian(flag, &inendian, &outendian) != 0)
  532                 return (EBADF);
  533 
  534         u16l = u32l = 0;
  535         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
  536 
  537         if ((flag & UCONV_IN_ACCEPT_BOM) &&
  538             check_bom32(u32s, *utf32len, &inendian))
  539                 u32l++;
  540 
  541         inendian &= UCONV_IN_NAT_ENDIAN;
  542         outendian &= UCONV_OUT_NAT_ENDIAN;
  543 
  544         if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
  545                 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
  546                     UCONV_BOM_SWAPPED;
  547 
  548         for (; u32l < *utf32len; u32l++) {
  549                 if (u32s[u32l] == 0 && do_not_ignore_null)
  550                         break;
  551 
  552                 hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
  553 
  554                 /*
  555                  * Anything bigger than the Unicode coding space, i.e.,
  556                  * Unicode scalar value bigger than U+10FFFF, is an illegal
  557                  * character.
  558                  */
  559                 if (hi > UCONV_UNICODE_MAX)
  560                         return (EILSEQ);
  561 
  562                 /*
  563                  * Anything bigger than U+FFFF must be converted into
  564                  * a surrogate pair in UTF-16.
  565                  */
  566                 if (hi >= UCONV_U16_START) {
  567                         lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
  568                             UCONV_U16_LO_MIN;
  569                         hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
  570                             UCONV_U16_HI_MIN;
  571 
  572                         if ((u16l + 1) >= *utf16len)
  573                                 return (E2BIG);
  574 
  575                         if (outendian) {
  576                                 u16s[u16l++] = (uint16_t)hi;
  577                                 u16s[u16l++] = (uint16_t)lo;
  578                         } else {
  579                                 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
  580                                 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
  581                         }
  582                 } else {
  583                         if (u16l >= *utf16len)
  584                                 return (E2BIG);
  585                         u16s[u16l++] = (outendian) ? (uint16_t)hi :
  586                             BSWAP_16(((uint16_t)hi));
  587                 }
  588         }
  589 
  590         *utf16len = u16l;
  591         *utf32len = u32l;
  592 
  593         return (0);
  594 }
  595 
  596 int
  597 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
  598     uchar_t *u8s, size_t *utf8len, int flag)
  599 {
  600         int inendian;
  601         int outendian;
  602         size_t u32l;
  603         size_t u8l;
  604         uint32_t lo;
  605         boolean_t do_not_ignore_null;
  606 
  607         if (u32s == NULL || utf32len == NULL)
  608                 return (EILSEQ);
  609 
  610         if (u8s == NULL || utf8len == NULL)
  611                 return (E2BIG);
  612 
  613         if (check_endian(flag, &inendian, &outendian) != 0)
  614                 return (EBADF);
  615 
  616         u32l = u8l = 0;
  617         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
  618 
  619         if ((flag & UCONV_IN_ACCEPT_BOM) &&
  620             check_bom32(u32s, *utf32len, &inendian))
  621                 u32l++;
  622 
  623         inendian &= UCONV_IN_NAT_ENDIAN;
  624 
  625         for (; u32l < *utf32len; u32l++) {
  626                 if (u32s[u32l] == 0 && do_not_ignore_null)
  627                         break;
  628 
  629                 lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
  630 
  631                 if (lo <= UCONV_U8_ONE_BYTE) {
  632                         if (u8l >= *utf8len)
  633                                 return (E2BIG);
  634                         u8s[u8l++] = (uchar_t)lo;
  635                 } else if (lo <= UCONV_U8_TWO_BYTES) {
  636                         if ((u8l + 1) >= *utf8len)
  637                                 return (E2BIG);
  638                         u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
  639                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
  640                 } else if (lo <= UCONV_U8_THREE_BYTES) {
  641                         if ((u8l + 2) >= *utf8len)
  642                                 return (E2BIG);
  643                         u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
  644                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
  645                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
  646                 } else if (lo <= UCONV_U8_FOUR_BYTES) {
  647                         if ((u8l + 3) >= *utf8len)
  648                                 return (E2BIG);
  649                         u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
  650                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
  651                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
  652                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
  653                 } else {
  654                         return (EILSEQ);
  655                 }
  656         }
  657 
  658         *utf32len = u32l;
  659         *utf8len = u8l;
  660 
  661         return (0);
  662 }
  663 
  664 int
  665 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
  666     uint16_t *u16s, size_t *utf16len, int flag)
  667 {
  668         int inendian;
  669         int outendian;
  670         size_t u16l;
  671         size_t u8l;
  672         uint32_t hi;
  673         uint32_t lo;
  674         int remaining_bytes;
  675         int first_b;
  676         boolean_t do_not_ignore_null;
  677 
  678         if (u8s == NULL || utf8len == NULL)
  679                 return (EILSEQ);
  680 
  681         if (u16s == NULL || utf16len == NULL)
  682                 return (E2BIG);
  683 
  684         if (check_endian(flag, &inendian, &outendian) != 0)
  685                 return (EBADF);
  686 
  687         u16l = u8l = 0;
  688         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
  689 
  690         outendian &= UCONV_OUT_NAT_ENDIAN;
  691 
  692         if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
  693                 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
  694                     UCONV_BOM_SWAPPED;
  695 
  696         for (; u8l < *utf8len; ) {
  697                 if (u8s[u8l] == 0 && do_not_ignore_null)
  698                         break;
  699 
  700                 /*
  701                  * Collect a UTF-8 character and convert it to a UTF-32
  702                  * character. In doing so, we screen out illegally formed
  703                  * UTF-8 characters and treat such as illegal characters.
  704                  * The algorithm at below also screens out anything bigger
  705                  * than the U+10FFFF.
  706                  *
  707                  * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
  708                  * more details on the illegal values of UTF-8 character
  709                  * bytes.
  710                  */
  711                 hi = (uint32_t)u8s[u8l++];
  712 
  713                 if (hi > UCONV_ASCII_MAX) {
  714                         if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
  715                                 return (EILSEQ);
  716 
  717                         first_b = hi;
  718                         hi = hi & u8_masks_tbl[remaining_bytes];
  719 
  720                         for (; remaining_bytes > 0; remaining_bytes--) {
  721                                 /*
  722                                  * If we have no more bytes, the current
  723                                  * UTF-8 character is incomplete.
  724                                  */
  725                                 if (u8l >= *utf8len)
  726                                         return (EINVAL);
  727 
  728                                 lo = (uint32_t)u8s[u8l++];
  729 
  730                                 if (first_b) {
  731                                         if (lo < valid_min_2nd_byte[first_b] ||
  732                                             lo > valid_max_2nd_byte[first_b])
  733                                                 return (EILSEQ);
  734                                         first_b = 0;
  735                                 } else if (lo < UCONV_U8_BYTE_MIN ||
  736                                     lo > UCONV_U8_BYTE_MAX) {
  737                                         return (EILSEQ);
  738                                 }
  739                                 hi = (hi << UCONV_U8_BIT_SHIFT) |
  740                                     (lo & UCONV_U8_BIT_MASK);
  741                         }
  742                 }
  743 
  744                 if (hi >= UCONV_U16_START) {
  745                         lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
  746                             UCONV_U16_LO_MIN;
  747                         hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
  748                             UCONV_U16_HI_MIN;
  749 
  750                         if ((u16l + 1) >= *utf16len)
  751                                 return (E2BIG);
  752 
  753                         if (outendian) {
  754                                 u16s[u16l++] = (uint16_t)hi;
  755                                 u16s[u16l++] = (uint16_t)lo;
  756                         } else {
  757                                 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
  758                                 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
  759                         }
  760                 } else {
  761                         if (u16l >= *utf16len)
  762                                 return (E2BIG);
  763 
  764                         u16s[u16l++] = (outendian) ? (uint16_t)hi :
  765                             BSWAP_16(((uint16_t)hi));
  766                 }
  767         }
  768 
  769         *utf16len = u16l;
  770         *utf8len = u8l;
  771 
  772         return (0);
  773 }
  774 
  775 int
  776 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
  777     uint32_t *u32s, size_t *utf32len, int flag)
  778 {
              /*
               * Convert the UTF-8 byte sequence in u8s into UTF-32 values
               * stored in u32s.
               *
               * u8s, utf8len:   input bytes; *utf8len is the number of bytes
               *                 available on entry and, on success, the number
               *                 of bytes consumed.
               * u32s, utf32len: output array; *utf32len is its capacity in
               *                 uint32_t elements on entry and, on success,
               *                 the number of elements written (including a
               *                 BOM if one was emitted).
               * flag:           bit flags; this function tests
               *                 UCONV_IGNORE_NULL and UCONV_OUT_EMIT_BOM
               *                 itself and passes the whole value to
               *                 check_endian() for the byte-order selection.
               *
               * Returns 0 on success, or:
               *   EILSEQ - u8s or utf8len is NULL, or a lead/continuation
               *            byte failed validation;
               *   E2BIG  - u32s or utf32len is NULL, or the output array is
               *            full;
               *   EBADF  - check_endian() returned nonzero for flag;
               *   EINVAL - the input ends in the middle of a multi-byte
               *            character.
               * On every error return *utf8len and *utf32len are left
               * unchanged; elements already stored in u32s are not undone.
               */
  779         int inendian;   /* filled in by check_endian(); not read afterwards */
  780         int outendian;
  781         size_t u32l;    /* next free index in u32s */
  782         size_t u8l;     /* next unread index in u8s */
  783         uint32_t hi;    /* lead byte, then the code point being assembled */
  784         uint32_t c;     /* current continuation byte */
  785         int remaining_bytes;
  786         int first_b;
  787         boolean_t do_not_ignore_null;
  788 
  789         if (u8s == NULL || utf8len == NULL)
  790                 return (EILSEQ);
  791 
  792         if (u32s == NULL || utf32len == NULL)
  793                 return (E2BIG);
  794 
  795         if (check_endian(flag, &inendian, &outendian) != 0)
  796                 return (EBADF);
  797 
  798         u32l = u8l = 0;
              /* Unless UCONV_IGNORE_NULL is set, a zero byte ends the conversion. */
  799         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
  800 
              /*
               * Reduce outendian to a yes/no value: nonzero means values are
               * stored as-is below, zero means they are byte-swapped first.
               */
  801         outendian &= UCONV_OUT_NAT_ENDIAN;
  802 
              /*
               * A BOM is emitted only when it was requested, there is input
               * to convert, and the output has room for at least one element.
               * It takes one output slot and is counted in *utf32len.
               */
  803         if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
  804                 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
  805                     UCONV_BOM_SWAPPED_32;
  806 
  807         for (; u8l < *utf8len; ) {
                      /* The terminating zero byte is neither consumed nor copied. */
  808                 if (u8s[u8l] == 0 && do_not_ignore_null)
  809                         break;
  810 
  811                 hi = (uint32_t)u8s[u8l++];
  812 
                      /* Anything above the ASCII range starts a multi-byte character. */
  813                 if (hi > UCONV_ASCII_MAX) {
                              /*
                               * The table gives how many more bytes must follow
                               * this lead byte; an entry of 0 marks a byte that
                               * cannot start a character.
                               */
  814                         if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
  815                                 return (EILSEQ);
  816 
                              /*
                               * Remember the lead byte for the second-byte range
                               * check, then mask it (presumably down to its payload
                               * bits; u8_masks_tbl is defined outside this view).
                               */
  817                         first_b = hi;
  818                         hi = hi & u8_masks_tbl[remaining_bytes];
  819 
  820                         for (; remaining_bytes > 0; remaining_bytes--) {
                                      /* Input ran out before the character was complete. */
  821                                 if (u8l >= *utf8len)
  822                                         return (EINVAL);
  823 
  824                                 c = (uint32_t)u8s[u8l++];
  825 
                                      /*
                                       * The second byte has a range that depends on the
                                       * lead byte; first_b is cleared afterwards so the
                                       * later bytes use the generic range check.
                                       */
  826                                 if (first_b) {
  827                                         if (c < valid_min_2nd_byte[first_b] ||
  828                                             c > valid_max_2nd_byte[first_b])
  829                                                 return (EILSEQ);
  830                                         first_b = 0;
  831                                 } else if (c < UCONV_U8_BYTE_MIN ||
  832                                     c > UCONV_U8_BYTE_MAX) {
  833                                         return (EILSEQ);
  834                                 }
                                      /* Append this byte's masked bits to the value so far. */
  835                                 hi = (hi << UCONV_U8_BIT_SHIFT) |
  836                                     (c & UCONV_U8_BIT_MASK);
  837                         }
  838                 }
  839 
                      /* Check for room before every store. */
  840                 if (u32l >= *utf32len)
  841                         return (E2BIG);
  842 
  843                 u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
  844         }
  845 
              /* Report how much output was produced and how much input was used. */
  846         *utf32len = u32l;
  847         *utf8len = u8l;
  848 
  849         return (0);
  850 }
  851 
  852 #if defined(_KERNEL)
      /*
       * Only when _KERNEL is defined: pass each of the six uconv_* conversion
       * entry points to EXPORT_SYMBOL().
       * NOTE(review): EXPORT_SYMBOL is defined outside this view; presumably it
       * makes these functions usable from other kernel modules -- confirm
       * against its definition.
       */
  853 EXPORT_SYMBOL(uconv_u16tou32);
  854 EXPORT_SYMBOL(uconv_u16tou8);
  855 EXPORT_SYMBOL(uconv_u32tou16);
  856 EXPORT_SYMBOL(uconv_u32tou8);
  857 EXPORT_SYMBOL(uconv_u8tou16);
  858 EXPORT_SYMBOL(uconv_u8tou32);
  859 #endif /* defined(_KERNEL) */

Cache object: e8249c433b284ef07aa7cc292ba7a589


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.