| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| /* |
| * Copyright 2008 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| |
| #pragma ident "%Z%%M% %I% %E% SMI" |
| |
| /* |
| * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. |
| * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517) |
| * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F), |
| * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also |
| * the section 3C man pages. |
| * Interface stability: Committed |
| */ |
| |
| #include <sys/types.h> |
| #ifdef _KERNEL |
| #include <sys/param.h> |
| #include <sys/sysmacros.h> |
| #include <sys/systm.h> |
| #include <sys/debug.h> |
| #include <sys/kmem.h> |
| #include <sys/sunddi.h> |
| #else |
| #include <sys/u8_textprep.h> |
| #endif /* _KERNEL */ |
| #include <sys/byteorder.h> |
| #include <sys/errno.h> |
| |
| |
/*
 * UTF-16 surrogate arithmetic: the inclusive ranges of the high (leading)
 * and low (trailing) surrogate code units, the factor that separates the
 * two halves of a pair, the mask applied to the combined pair value, and
 * the first code point that lies outside of the BMP.
 */
#define	UCONV_U16_HI_MIN		(0xd800U)
#define	UCONV_U16_HI_MAX		(0xdbffU)
#define	UCONV_U16_LO_MIN		(0xdc00U)
#define	UCONV_U16_LO_MAX		(0xdfffU)
#define	UCONV_U16_BIT_SHIFT		(0x0400U)
#define	UCONV_U16_BIT_MASK		(0x0fffffU)
#define	UCONV_U16_START			(0x010000U)

/* Largest value of the Unicode coding space and of the ASCII coding space. */
#define	UCONV_UNICODE_MAX		(0x10ffffU)
#define	UCONV_ASCII_MAX			(0x7fU)

/* Masks that isolate the input and the output endian bits of a flag value. */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Endian flags that match (NAT) and that are opposite to (REV) the byte
 * order selected at compile time through _BIG_ENDIAN.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN		UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN		UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN		UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN		UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN		UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN		UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN		UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN		UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) as it reads in the native byte order, and as it
 * reads when a 16-bit or a 32-bit BOM is seen in the opposite byte order.
 */
#define	UCONV_BOM_NORMAL		(0xfeffU)
#define	UCONV_BOM_SWAPPED		(0xfffeU)
#define	UCONV_BOM_SWAPPED_32		(0xfffe0000U)

/* Largest UTF-32 value that fits in a UTF-8 character of each byte length. */
#define	UCONV_U8_ONE_BYTE		(0x7fU)
#define	UCONV_U8_TWO_BYTES		(0x7ffU)
#define	UCONV_U8_THREE_BYTES		(0xffffU)
#define	UCONV_U8_FOUR_BYTES		(0x10ffffU)

/* Generic lower and upper bounds of a UTF-8 continuation byte. */
#define	UCONV_U8_BYTE_MIN		(0x80U)
#define	UCONV_U8_BYTE_MAX		(0xbfU)

/*
 * A UTF-8 continuation byte has the bit pattern "10xx xxxx": it contributes
 * six bits, hence the shift count of 6 and the mask of 0x3f.
 */
#define	UCONV_U8_BIT_SHIFT		6
#define	UCONV_U8_BIT_MASK		0x3f
| |
| /* |
| * The following vector shows remaining bytes in a UTF-8 character. |
| * Index will be the first byte of the character. |
| */ |
| static const uchar_t remaining_bytes_tbl[0x100] = { |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| |
| /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ |
| 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| |
| /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| |
| /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ |
| 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| |
| /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ |
| 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| }; |
| |
| /* |
| * The following is a vector of bit-masks to get used bits in |
| * the first byte of a UTF-8 character. Index is remaining bytes at above of |
| * the character. |
| */ |
| #ifdef _KERNEL |
| const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; |
| #else |
| static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; |
| #endif /* _KERNEL */ |
| |
| /* |
| * The following two vectors are to provide valid minimum and |
| * maximum values for the 2'nd byte of a multibyte UTF-8 character for |
| * better illegal sequence checking. The index value must be the value of |
| * the first byte of the UTF-8 character. |
| */ |
| static const uchar_t valid_min_2nd_byte[0x100] = { |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| |
| /* C0 C1 C2 C3 C4 C5 C6 C7 */ |
| 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
| |
| /* C8 C9 CA CB CC CD CE CF */ |
| 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
| |
| /* D0 D1 D2 D3 D4 D5 D6 D7 */ |
| 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
| |
| /* D8 D9 DA DB DC DD DE DF */ |
| 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
| |
| /* E0 E1 E2 E3 E4 E5 E6 E7 */ |
| 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
| |
| /* E8 E9 EA EB EC ED EE EF */ |
| 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
| |
| /* F0 F1 F2 F3 F4 F5 F6 F7 */ |
| 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, |
| |
| 0, 0, 0, 0, 0, 0, 0, 0 |
| }; |
| |
| static const uchar_t valid_max_2nd_byte[0x100] = { |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| |
| /* C0 C1 C2 C3 C4 C5 C6 C7 */ |
| 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, |
| |
| /* C8 C9 CA CB CC CD CE CF */ |
| 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, |
| |
| /* D0 D1 D2 D3 D4 D5 D6 D7 */ |
| 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, |
| |
| /* D8 D9 DA DB DC DD DE DF */ |
| 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, |
| |
| /* E0 E1 E2 E3 E4 E5 E6 E7 */ |
| 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, |
| |
| /* E8 E9 EA EB EC ED EE EF */ |
| 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, |
| |
| /* F0 F1 F2 F3 F4 F5 F6 F7 */ |
| 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, |
| |
| 0, 0, 0, 0, 0, 0, 0, 0 |
| }; |
| |
| |
| static int |
| check_endian(int flag, int *in, int *out) |
| { |
| *in = flag & UCONV_IN_ENDIAN_MASKS; |
| |
| /* You cannot have both. */ |
| if (*in == UCONV_IN_ENDIAN_MASKS) |
| return (EBADF); |
| |
| if (*in == 0) |
| *in = UCONV_IN_NAT_ENDIAN; |
| |
| *out = flag & UCONV_OUT_ENDIAN_MASKS; |
| |
| /* You cannot have both. */ |
| if (*out == UCONV_OUT_ENDIAN_MASKS) |
| return (EBADF); |
| |
| if (*out == 0) |
| *out = UCONV_OUT_NAT_ENDIAN; |
| |
| return (0); |
| } |
| |
| static boolean_t |
| check_bom16(const uint16_t *u16s, size_t u16l, int *in) |
| { |
| if (u16l > 0) { |
| if (*u16s == UCONV_BOM_NORMAL) { |
| *in = UCONV_IN_NAT_ENDIAN; |
| return (B_TRUE); |
| } |
| if (*u16s == UCONV_BOM_SWAPPED) { |
| *in = UCONV_IN_REV_ENDIAN; |
| return (B_TRUE); |
| } |
| } |
| |
| return (B_FALSE); |
| } |
| |
| static boolean_t |
| check_bom32(const uint32_t *u32s, size_t u32l, int *in) |
| { |
| if (u32l > 0) { |
| if (*u32s == UCONV_BOM_NORMAL) { |
| *in = UCONV_IN_NAT_ENDIAN; |
| return (B_TRUE); |
| } |
| if (*u32s == UCONV_BOM_SWAPPED_32) { |
| *in = UCONV_IN_REV_ENDIAN; |
| return (B_TRUE); |
| } |
| } |
| |
| return (B_FALSE); |
| } |
| |
| int |
| uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, |
| uint32_t *u32s, size_t *utf32len, int flag) |
| { |
| int inendian; |
| int outendian; |
| size_t u16l; |
| size_t u32l; |
| uint32_t hi; |
| uint32_t lo; |
| boolean_t do_not_ignore_null; |
| |
| /* |
| * Do preliminary validity checks on parameters and collect info on |
| * endians. |
| */ |
| if (u16s == NULL || utf16len == NULL) |
| return (EILSEQ); |
| |
| if (u32s == NULL || utf32len == NULL) |
| return (E2BIG); |
| |
| if (check_endian(flag, &inendian, &outendian) != 0) |
| return (EBADF); |
| |
| /* |
| * Initialize input and output parameter buffer indices and |
| * temporary variables. |
| */ |
| u16l = u32l = 0; |
| hi = 0; |
| do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); |
| |
| /* |
| * Check on the BOM at the beginning of the input buffer if required |
| * and if there is indeed one, process it. |
| */ |
| if ((flag & UCONV_IN_ACCEPT_BOM) && |
| check_bom16(u16s, *utf16len, &inendian)) |
| u16l++; |
| |
| /* |
| * Reset inendian and outendian so that after this point, those can be |
| * used as condition values. |
| */ |
| inendian &= UCONV_IN_NAT_ENDIAN; |
| outendian &= UCONV_OUT_NAT_ENDIAN; |
| |
| /* |
| * If there is something in the input buffer and if necessary and |
| * requested, save the BOM at the output buffer. |
| */ |
| if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) |
| u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : |
| UCONV_BOM_SWAPPED_32; |
| |
| /* |
| * Do conversion; if encounter a surrogate pair, assemble high and |
| * low pair values to form a UTF-32 character. If a half of a pair |
| * exists alone, then, either it is an illegal (EILSEQ) or |
| * invalid (EINVAL) value. |
| */ |
| for (; u16l < *utf16len; u16l++) { |
| if (u16s[u16l] == 0 && do_not_ignore_null) |
| break; |
| |
| lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); |
| |
| if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { |
| if (hi) |
| return (EILSEQ); |
| hi = lo; |
| continue; |
| } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { |
| if (! hi) |
| return (EILSEQ); |
| lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + |
| lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) |
| + UCONV_U16_START; |
| hi = 0; |
| } else if (hi) { |
| return (EILSEQ); |
| } |
| |
| if (u32l >= *utf32len) |
| return (E2BIG); |
| |
| u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); |
| } |
| |
| /* |
| * If high half didn't see low half, then, it's most likely the input |
| * parameter is incomplete. |
| */ |
| if (hi) |
| return (EINVAL); |
| |
| /* |
| * Save the number of consumed and saved characters. They do not |
| * include terminating NULL character (U+0000) at the end of |
| * the input buffer (even when UCONV_IGNORE_NULL isn't specified and |
| * the input buffer length is big enough to include the terminating |
| * NULL character). |
| */ |
| *utf16len = u16l; |
| *utf32len = u32l; |
| |
| return (0); |
| } |
| |
| int |
| uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, |
| uchar_t *u8s, size_t *utf8len, int flag) |
| { |
| int inendian; |
| int outendian; |
| size_t u16l; |
| size_t u8l; |
| uint32_t hi; |
| uint32_t lo; |
| boolean_t do_not_ignore_null; |
| |
| if (u16s == NULL || utf16len == NULL) |
| return (EILSEQ); |
| |
| if (u8s == NULL || utf8len == NULL) |
| return (E2BIG); |
| |
| if (check_endian(flag, &inendian, &outendian) != 0) |
| return (EBADF); |
| |
| u16l = u8l = 0; |
| hi = 0; |
| do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); |
| |
| if ((flag & UCONV_IN_ACCEPT_BOM) && |
| check_bom16(u16s, *utf16len, &inendian)) |
| u16l++; |
| |
| inendian &= UCONV_IN_NAT_ENDIAN; |
| |
| for (; u16l < *utf16len; u16l++) { |
| if (u16s[u16l] == 0 && do_not_ignore_null) |
| break; |
| |
| lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); |
| |
| if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { |
| if (hi) |
| return (EILSEQ); |
| hi = lo; |
| continue; |
| } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { |
| if (! hi) |
| return (EILSEQ); |
| lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + |
| lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) |
| + UCONV_U16_START; |
| hi = 0; |
| } else if (hi) { |
| return (EILSEQ); |
| } |
| |
| /* |
| * Now we convert a UTF-32 character into a UTF-8 character. |
| * Unicode coding space is between U+0000 and U+10FFFF; |
| * anything bigger is an illegal character. |
| */ |
| if (lo <= UCONV_U8_ONE_BYTE) { |
| if (u8l >= *utf8len) |
| return (E2BIG); |
| u8s[u8l++] = (uchar_t)lo; |
| } else if (lo <= UCONV_U8_TWO_BYTES) { |
| if ((u8l + 1) >= *utf8len) |
| return (E2BIG); |
| u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); |
| u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); |
| } else if (lo <= UCONV_U8_THREE_BYTES) { |
| if ((u8l + 2) >= *utf8len) |
| return (E2BIG); |
| u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); |
| u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); |
| u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); |
| } else if (lo <= UCONV_U8_FOUR_BYTES) { |
| if ((u8l + 3) >= *utf8len) |
| return (E2BIG); |
| u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); |
| u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); |
| u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); |
| u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); |
| } else { |
| return (EILSEQ); |
| } |
| } |
| |
| if (hi) |
| return (EINVAL); |
| |
| *utf16len = u16l; |
| *utf8len = u8l; |
| |
| return (0); |
| } |
| |
| int |
| uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, |
| uint16_t *u16s, size_t *utf16len, int flag) |
| { |
| int inendian; |
| int outendian; |
| size_t u16l; |
| size_t u32l; |
| uint32_t hi; |
| uint32_t lo; |
| boolean_t do_not_ignore_null; |
| |
| if (u32s == NULL || utf32len == NULL) |
| return (EILSEQ); |
| |
| if (u16s == NULL || utf16len == NULL) |
| return (E2BIG); |
| |
| if (check_endian(flag, &inendian, &outendian) != 0) |
| return (EBADF); |
| |
| u16l = u32l = 0; |
| do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); |
| |
| if ((flag & UCONV_IN_ACCEPT_BOM) && |
| check_bom32(u32s, *utf32len, &inendian)) |
| u32l++; |
| |
| inendian &= UCONV_IN_NAT_ENDIAN; |
| outendian &= UCONV_OUT_NAT_ENDIAN; |
| |
| if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) |
| u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : |
| UCONV_BOM_SWAPPED; |
| |
| for (; u32l < *utf32len; u32l++) { |
| if (u32s[u32l] == 0 && do_not_ignore_null) |
| break; |
| |
| hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); |
| |
| /* |
| * Anything bigger than the Unicode coding space, i.e., |
| * Unicode scalar value bigger than U+10FFFF, is an illegal |
| * character. |
| */ |
| if (hi > UCONV_UNICODE_MAX) |
| return (EILSEQ); |
| |
| /* |
| * Anything bigger than U+FFFF must be converted into |
| * a surrogate pair in UTF-16. |
| */ |
| if (hi >= UCONV_U16_START) { |
| lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + |
| UCONV_U16_LO_MIN; |
| hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + |
| UCONV_U16_HI_MIN; |
| |
| if ((u16l + 1) >= *utf16len) |
| return (E2BIG); |
| |
| if (outendian) { |
| u16s[u16l++] = (uint16_t)hi; |
| u16s[u16l++] = (uint16_t)lo; |
| } else { |
| u16s[u16l++] = BSWAP_16(((uint16_t)hi)); |
| u16s[u16l++] = BSWAP_16(((uint16_t)lo)); |
| } |
| } else { |
| if (u16l >= *utf16len) |
| return (E2BIG); |
| u16s[u16l++] = (outendian) ? (uint16_t)hi : |
| BSWAP_16(((uint16_t)hi)); |
| } |
| } |
| |
| *utf16len = u16l; |
| *utf32len = u32l; |
| |
| return (0); |
| } |
| |
| int |
| uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, |
| uchar_t *u8s, size_t *utf8len, int flag) |
| { |
| int inendian; |
| int outendian; |
| size_t u32l; |
| size_t u8l; |
| uint32_t lo; |
| boolean_t do_not_ignore_null; |
| |
| if (u32s == NULL || utf32len == NULL) |
| return (EILSEQ); |
| |
| if (u8s == NULL || utf8len == NULL) |
| return (E2BIG); |
| |
| if (check_endian(flag, &inendian, &outendian) != 0) |
| return (EBADF); |
| |
| u32l = u8l = 0; |
| do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); |
| |
| if ((flag & UCONV_IN_ACCEPT_BOM) && |
| check_bom32(u32s, *utf32len, &inendian)) |
| u32l++; |
| |
| inendian &= UCONV_IN_NAT_ENDIAN; |
| |
| for (; u32l < *utf32len; u32l++) { |
| if (u32s[u32l] == 0 && do_not_ignore_null) |
| break; |
| |
| lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); |
| |
| if (lo <= UCONV_U8_ONE_BYTE) { |
| if (u8l >= *utf8len) |
| return (E2BIG); |
| u8s[u8l++] = (uchar_t)lo; |
| } else if (lo <= UCONV_U8_TWO_BYTES) { |
| if ((u8l + 1) >= *utf8len) |
| return (E2BIG); |
| u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); |
| u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); |
| } else if (lo <= UCONV_U8_THREE_BYTES) { |
| if ((u8l + 2) >= *utf8len) |
| return (E2BIG); |
| u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); |
| u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); |
| u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); |
| } else if (lo <= UCONV_U8_FOUR_BYTES) { |
| if ((u8l + 3) >= *utf8len) |
| return (E2BIG); |
| u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); |
| u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); |
| u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); |
| u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); |
| } else { |
| return (EILSEQ); |
| } |
| } |
| |
| *utf32len = u32l; |
| *utf8len = u8l; |
| |
| return (0); |
| } |
| |
| int |
| uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, |
| uint16_t *u16s, size_t *utf16len, int flag) |
| { |
| int inendian; |
| int outendian; |
| size_t u16l; |
| size_t u8l; |
| uint32_t hi; |
| uint32_t lo; |
| int remaining_bytes; |
| int first_b; |
| boolean_t do_not_ignore_null; |
| |
| if (u8s == NULL || utf8len == NULL) |
| return (EILSEQ); |
| |
| if (u16s == NULL || utf16len == NULL) |
| return (E2BIG); |
| |
| if (check_endian(flag, &inendian, &outendian) != 0) |
| return (EBADF); |
| |
| u16l = u8l = 0; |
| do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); |
| |
| outendian &= UCONV_OUT_NAT_ENDIAN; |
| |
| if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) |
| u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : |
| UCONV_BOM_SWAPPED; |
| |
| for (; u8l < *utf8len; ) { |
| if (u8s[u8l] == 0 && do_not_ignore_null) |
| break; |
| |
| /* |
| * Collect a UTF-8 character and convert it to a UTF-32 |
| * character. In doing so, we screen out illegally formed |
| * UTF-8 characters and treat such as illegal characters. |
| * The algorithm at below also screens out anything bigger |
| * than the U+10FFFF. |
| * |
| * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for |
| * more details on the illegal values of UTF-8 character |
| * bytes. |
| */ |
| hi = (uint32_t)u8s[u8l++]; |
| |
| if (hi > UCONV_ASCII_MAX) { |
| if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) |
| return (EILSEQ); |
| |
| first_b = hi; |
| hi = hi & u8_masks_tbl[remaining_bytes]; |
| |
| for (; remaining_bytes > 0; remaining_bytes--) { |
| /* |
| * If we have no more bytes, the current |
| * UTF-8 character is incomplete. |
| */ |
| if (u8l >= *utf8len) |
| return (EINVAL); |
| |
| lo = (uint32_t)u8s[u8l++]; |
| |
| if (first_b) { |
| if (lo < valid_min_2nd_byte[first_b] || |
| lo > valid_max_2nd_byte[first_b]) |
| return (EILSEQ); |
| first_b = 0; |
| } else if (lo < UCONV_U8_BYTE_MIN || |
| lo > UCONV_U8_BYTE_MAX) { |
| return (EILSEQ); |
| } |
| hi = (hi << UCONV_U8_BIT_SHIFT) | |
| (lo & UCONV_U8_BIT_MASK); |
| } |
| } |
| |
| if (hi >= UCONV_U16_START) { |
| lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + |
| UCONV_U16_LO_MIN; |
| hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + |
| UCONV_U16_HI_MIN; |
| |
| if ((u16l + 1) >= *utf16len) |
| return (E2BIG); |
| |
| if (outendian) { |
| u16s[u16l++] = (uint16_t)hi; |
| u16s[u16l++] = (uint16_t)lo; |
| } else { |
| u16s[u16l++] = BSWAP_16(((uint16_t)hi)); |
| u16s[u16l++] = BSWAP_16(((uint16_t)lo)); |
| } |
| } else { |
| if (u16l >= *utf16len) |
| return (E2BIG); |
| |
| u16s[u16l++] = (outendian) ? (uint16_t)hi : |
| BSWAP_16(((uint16_t)hi)); |
| } |
| } |
| |
| *utf16len = u16l; |
| *utf8len = u8l; |
| |
| return (0); |
| } |
| |
| int |
| uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, |
| uint32_t *u32s, size_t *utf32len, int flag) |
| { |
| int inendian; |
| int outendian; |
| size_t u32l; |
| size_t u8l; |
| uint32_t hi; |
| uint32_t c; |
| int remaining_bytes; |
| int first_b; |
| boolean_t do_not_ignore_null; |
| |
| if (u8s == NULL || utf8len == NULL) |
| return (EILSEQ); |
| |
| if (u32s == NULL || utf32len == NULL) |
| return (E2BIG); |
| |
| if (check_endian(flag, &inendian, &outendian) != 0) |
| return (EBADF); |
| |
| u32l = u8l = 0; |
| do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); |
| |
| outendian &= UCONV_OUT_NAT_ENDIAN; |
| |
| if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) |
| u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : |
| UCONV_BOM_SWAPPED_32; |
| |
| for (; u8l < *utf8len; ) { |
| if (u8s[u8l] == 0 && do_not_ignore_null) |
| break; |
| |
| hi = (uint32_t)u8s[u8l++]; |
| |
| if (hi > UCONV_ASCII_MAX) { |
| if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) |
| return (EILSEQ); |
| |
| first_b = hi; |
| hi = hi & u8_masks_tbl[remaining_bytes]; |
| |
| for (; remaining_bytes > 0; remaining_bytes--) { |
| if (u8l >= *utf8len) |
| return (EINVAL); |
| |
| c = (uint32_t)u8s[u8l++]; |
| |
| if (first_b) { |
| if (c < valid_min_2nd_byte[first_b] || |
| c > valid_max_2nd_byte[first_b]) |
| return (EILSEQ); |
| first_b = 0; |
| } else if (c < UCONV_U8_BYTE_MIN || |
| c > UCONV_U8_BYTE_MAX) { |
| return (EILSEQ); |
| } |
| hi = (hi << UCONV_U8_BIT_SHIFT) | |
| (c & UCONV_U8_BIT_MASK); |
| } |
| } |
| |
| if (u32l >= *utf32len) |
| return (E2BIG); |
| |
| u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi); |
| } |
| |
| *utf32len = u32l; |
| *utf8len = u8l; |
| |
| return (0); |
| } |