zfs/lib/libport/u8_textprep.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26
  27 #include "zfs_config.h"
  28
  29 #ifndef HAVE_UNICODE
  30
  31 /*
  32  * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
  33  *
  34  * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
  35  * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
  36  * the section 3C man pages.
  37  * Interface stability: Committed.
  38  */
  39
  40 #include <sys/types.h>
  41 #ifdef  _KERNEL
  42 #include <sys/param.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/systm.h>
  45 #include <sys/debug.h>
  46 #include <sys/kmem.h>
  47 #include <sys/ddi.h>
  48 #include <sys/sunddi.h>
  49 #else
  50 #include <sys/u8_textprep.h>
  51 #include <strings.h>
  52 #endif  /* _KERNEL */
  53 #include <sys/byteorder.h>
  54 #include <sys/errno.h>
  55 #include <sys/u8_textprep_data.h>
  56
  57 #undef errno
  58
/*
 * The maximum possible number of bytes in a UTF-8 character. It is also
 * used in this file to size the per-character scratch buffers (plus one
 * byte for the terminating null) and to bound the length of a case mapping.
 */
#define U8_MB_CUR_MAX                   (4)

/*
 * The maximum number of bytes needed for a UTF-8 character to cover
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 * u8_validate() fails with ERANGE on any longer character when the
 * U8_VALIDATE_UCS2_RANGE flag is given.
 */
#define U8_MAX_BYTES_UCS2               (3)

/* The maximum possible number of bytes in a Stream-Safe Text. */
#define U8_STREAM_SAFE_TEXT_MAX         (128)

/*
 * The maximum number of characters in a combining/conjoining sequence and
 * the actual upper bound of a combining/conjoining sequence, which is one
 * less than that maximum.
 */
#define U8_MAX_CHARS_A_SEQ              (32)
#define U8_UPPER_LIMIT_IN_A_SEQ         (31)

/* The combining class value for a Starter. */
#define U8_COMBINING_CLASS_STARTER      (0)
  80
/*
 * Hangul related constants.
 *
 * These are the first and the last Unicode scalar values of the Hangul
 * syllables, the Hangul Jamo leading consonants (L), the vowels (V), and
 * the optional trailing consonants (T).
 *
 * Please note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 and not the actual
 * first trailing consonant, U+11A8. The trailing consonant is optional, so
 * the value is pre-decremented by one; a trailing consonant index of zero
 * then means "no trailing consonant".
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no trailing consonant case, i.e.,
 * 21 x 28 = 588.
 *
 * U8_HANGUL_JAMO_1ST_BYTE is only a quick pre-check on the first byte of
 * a UTF-8 character: a match means the character may be a Hangul Jamo, but
 * it does not guarantee that it is one.
 */
#define U8_HANGUL_SYL_FIRST             (0xAC00U)
#define U8_HANGUL_SYL_LAST              (0xD7A3U)

#define U8_HANGUL_JAMO_L_FIRST          (0x1100U)
#define U8_HANGUL_JAMO_L_LAST           (0x1112U)
#define U8_HANGUL_JAMO_V_FIRST          (0x1161U)
#define U8_HANGUL_JAMO_V_LAST           (0x1175U)
#define U8_HANGUL_JAMO_T_FIRST          (0x11A7U)
#define U8_HANGUL_JAMO_T_LAST           (0x11C2U)

#define U8_HANGUL_V_COUNT               (21)
#define U8_HANGUL_VT_COUNT              (588)
#define U8_HANGUL_T_COUNT               (28)

#define U8_HANGUL_JAMO_1ST_BYTE         (0xE1U)
 115
 116 #define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
 117         (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
 118         (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
 119         (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));
 120
 121 #define U8_HANGUL_JAMO_L(u) \
 122         ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)
 123
 124 #define U8_HANGUL_JAMO_V(u) \
 125         ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)
 126
 127 #define U8_HANGUL_JAMO_T(u) \
 128         ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
 129
 130 #define U8_HANGUL_JAMO(u) \
 131         ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
 132
 133 #define U8_HANGUL_SYLLABLE(u) \
 134         ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)
 135
 136 #define U8_HANGUL_COMPOSABLE_L_V(s, u) \
 137         ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))
 138
 139 #define U8_HANGUL_COMPOSABLE_LV_T(s, u) \
 140         ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
 141
/*
 * The types of decomposition mappings.
 *
 * NOTE(review): the code that consumes these two marker values is not in
 * this part of the file; presumably they tag entries of the decomposition
 * tables as either "canonical only" or "both canonical and compatibility"
 * -- confirm against do_decomp() and the table data header.
 */
#define U8_DECOMP_BOTH                  (0xF5U)
#define U8_DECOMP_CANONICAL             (0xF6U)

/*
 * The indicator for 16-bit table.
 *
 * NOTE(review): presumably a flag bit set in a table identifier to select
 * a 16-bit displacement table -- confirm at the use site.
 */
#define U8_16BIT_TABLE_INDICATOR        (0x8000U)
 148
 149 /* The following are some convenience macros. */
 150 #define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
 151         (u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
 152                 (uint32_t)(b3) & 0x3F;
 153
 154 #define U8_SIMPLE_SWAP(a, b, t) \
 155         (t) = (a); \
 156         (a) = (b); \
 157         (b) = (t);
 158
 159 #define U8_ASCII_TOUPPER(c) \
 160         (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))
 161
 162 #define U8_ASCII_TOLOWER(c) \
 163         (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))
 164
 165 #define U8_ISASCII(c)                   (((uchar_t)(c)) < 0x80U)
/*
 * Swap two adjacent characters, a and b, of a combining sequence in place.
 * a and b are indices of the per-character bookkeeping arrays.
 *
 * The macro assumes that the two characters that are to be swapped are
 * adjacent to each other and that 'a' comes before 'b'. If the assumptions
 * are not met, the macro will fail.
 *
 * Besides its two arguments, the macro uses the following names from the
 * enclosing scope: k (loop index), u8s (the byte buffer being reordered),
 * u8t (scratch space for the bytes of 'a'), start[] and disp[] (the offset
 * and the byte length of each character in u8s), comb_class[], and tc
 * (the temporary for U8_SIMPLE_SWAP).
 *
 * What it does: the bytes of 'a' are saved into u8t, the bytes of 'b' are
 * copied down to where 'a' started, start[b] is moved to right after them,
 * the saved bytes are written there, and finally the comb_class[] and
 * disp[] entries of the two are exchanged. start[a] does not change.
 *
 * The expansion is a sequence of statements that is not wrapped in a block,
 * so always use the macro inside braces.
 */
#define U8_SWAP_COMB_MARKS(a, b) \
        for (k = 0; k < disp[(a)]; k++) \
                u8t[k] = u8s[start[(a)] + k]; \
        for (k = 0; k < disp[(b)]; k++) \
                u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
        start[(b)] = start[(a)] + disp[(b)]; \
        for (k = 0; k < disp[(a)]; k++) \
                u8s[start[(b)] + k] = u8t[k]; \
        U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
        U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
 182
 183 /* The possible states during normalization. */
 184 typedef enum {
 185         U8_STATE_START = 0,
 186         U8_STATE_HANGUL_L = 1,
 187         U8_STATE_HANGUL_LV = 2,
 188         U8_STATE_HANGUL_LVT = 3,
 189         U8_STATE_HANGUL_V = 4,
 190         U8_STATE_HANGUL_T = 5,
 191         U8_STATE_COMBINING_MARK = 6
 192 } u8_normalization_states_t;
 193
 194 /*
 195  * The three vectors at below are used to check bytes of a given UTF-8
 196  * character are valid and not containing any malformed byte values.
 197  *
 198  * We used to have a quite relaxed UTF-8 binary representation but then there
 199  * was some security related issues and so the Unicode Consortium defined
 200  * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
 201  * one more time at the Unicode 3.2. The following three tables are based on
 202  * that.
 203  */
 204
 205 #define U8_ILLEGAL_NEXT_BYTE_COMMON(c)  ((c) < 0x80 || (c) > 0xBF)
 206
/* Short local aliases to keep the table rows narrow; undefined below. */
#define I_                              U8_ILLEGAL_CHAR
#define O_                              U8_OUT_OF_RANGE_CHAR

/*
 * u8_number_of_bytes[b] is the total byte length of the UTF-8 character
 * whose first byte is b: 1 for 00 - 7F, 2 for C2 - DF, 3 for E0 - EF, and
 * 4 for F0 - F4. A byte that cannot start a character (80 - BF, C0, C1) is
 * marked with U8_ILLEGAL_CHAR and a first byte beyond the Unicode coding
 * space (F5 - FF) with U8_OUT_OF_RANGE_CHAR. The callers in this file treat
 * those two markers as negative values.
 */
const int8_t u8_number_of_bytes[0x100] = {
/*      00 - 7F: single byte characters, sixteen entries per row */
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*      80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
        I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*      90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
        I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*      A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
        I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*      B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
        I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*      C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
        I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*      D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*      E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

/*      F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
        4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
};

#undef  I_
#undef  O_
 247
 248 const uint8_t u8_valid_min_2nd_byte[0x100] = {
 249         0,    0,    0,    0,    0,    0,    0,    0,
 250         0,    0,    0,    0,    0,    0,    0,    0,
 251         0,    0,    0,    0,    0,    0,    0,    0,
 252         0,    0,    0,    0,    0,    0,    0,    0,
 253         0,    0,    0,    0,    0,    0,    0,    0,
 254         0,    0,    0,    0,    0,    0,    0,    0,
 255         0,    0,    0,    0,    0,    0,    0,    0,
 256         0,    0,    0,    0,    0,    0,    0,    0,
 257         0,    0,    0,    0,    0,    0,    0,    0,
 258         0,    0,    0,    0,    0,    0,    0,    0,
 259         0,    0,    0,    0,    0,    0,    0,    0,
 260         0,    0,    0,    0,    0,    0,    0,    0,
 261         0,    0,    0,    0,    0,    0,    0,    0,
 262         0,    0,    0,    0,    0,    0,    0,    0,
 263         0,    0,    0,    0,    0,    0,    0,    0,
 264         0,    0,    0,    0,    0,    0,    0,    0,
 265         0,    0,    0,    0,    0,    0,    0,    0,
 266         0,    0,    0,    0,    0,    0,    0,    0,
 267         0,    0,    0,    0,    0,    0,    0,    0,
 268         0,    0,    0,    0,    0,    0,    0,    0,
 269         0,    0,    0,    0,    0,    0,    0,    0,
 270         0,    0,    0,    0,    0,    0,    0,    0,
 271         0,    0,    0,    0,    0,    0,    0,    0,
 272         0,    0,    0,    0,    0,    0,    0,    0,
 273 /*      C0    C1    C2    C3    C4    C5    C6    C7    */
 274         0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 275 /*      C8    C9    CA    CB    CC    CD    CE    CF    */
 276         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 277 /*      D0    D1    D2    D3    D4    D5    D6    D7    */
 278         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 279 /*      D8    D9    DA    DB    DC    DD    DE    DF    */
 280         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 281 /*      E0    E1    E2    E3    E4    E5    E6    E7    */
 282         0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 283 /*      E8    E9    EA    EB    EC    ED    EE    EF    */
 284         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 285 /*      F0    F1    F2    F3    F4    F5    F6    F7    */
 286         0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
 287         0,    0,    0,    0,    0,    0,    0,    0,
 288 };
 289
 290 const uint8_t u8_valid_max_2nd_byte[0x100] = {
 291         0,    0,    0,    0,    0,    0,    0,    0,
 292         0,    0,    0,    0,    0,    0,    0,    0,
 293         0,    0,    0,    0,    0,    0,    0,    0,
 294         0,    0,    0,    0,    0,    0,    0,    0,
 295         0,    0,    0,    0,    0,    0,    0,    0,
 296         0,    0,    0,    0,    0,    0,    0,    0,
 297         0,    0,    0,    0,    0,    0,    0,    0,
 298         0,    0,    0,    0,    0,    0,    0,    0,
 299         0,    0,    0,    0,    0,    0,    0,    0,
 300         0,    0,    0,    0,    0,    0,    0,    0,
 301         0,    0,    0,    0,    0,    0,    0,    0,
 302         0,    0,    0,    0,    0,    0,    0,    0,
 303         0,    0,    0,    0,    0,    0,    0,    0,
 304         0,    0,    0,    0,    0,    0,    0,    0,
 305         0,    0,    0,    0,    0,    0,    0,    0,
 306         0,    0,    0,    0,    0,    0,    0,    0,
 307         0,    0,    0,    0,    0,    0,    0,    0,
 308         0,    0,    0,    0,    0,    0,    0,    0,
 309         0,    0,    0,    0,    0,    0,    0,    0,
 310         0,    0,    0,    0,    0,    0,    0,    0,
 311         0,    0,    0,    0,    0,    0,    0,    0,
 312         0,    0,    0,    0,    0,    0,    0,    0,
 313         0,    0,    0,    0,    0,    0,    0,    0,
 314         0,    0,    0,    0,    0,    0,    0,    0,
 315 /*      C0    C1    C2    C3    C4    C5    C6    C7    */
 316         0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 317 /*      C8    C9    CA    CB    CC    CD    CE    CF    */
 318         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 319 /*      D0    D1    D2    D3    D4    D5    D6    D7    */
 320         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 321 /*      D8    D9    DA    DB    DC    DD    DE    DF    */
 322         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 323 /*      E0    E1    E2    E3    E4    E5    E6    E7    */
 324         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 325 /*      E8    E9    EA    EB    EC    ED    EE    EF    */
 326         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
 327 /*      F0    F1    F2    F3    F4    F5    F6    F7    */
 328         0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
 329         0,    0,    0,    0,    0,    0,    0,    0,
 330 };
 331
 332
 333 /*
 334  * The u8_validate() validates on the given UTF-8 character string and
 335  * calculate the byte length. It is quite similar to mblen(3C) except that
 336  * this will validate against the list of characters if required and
 337  * specific to UTF-8 and Unicode.
 338  */
 339 int
 340 u8_validate(char *u8str, size_t n, char **list, int flag, int *errno)
 341 {
 342         uchar_t *ib;
 343         uchar_t *ibtail;
 344         uchar_t **p;
 345         uchar_t *s1;
 346         uchar_t *s2;
 347         uchar_t f;
 348         int sz;
 349         size_t i;
 350         int ret_val;
 351         boolean_t second;
 352         boolean_t no_need_to_validate_entire;
 353         boolean_t check_additional;
 354         boolean_t validate_ucs2_range_only;
 355
 356         if (! u8str)
 357                 return (0);
 358
 359         ib = (uchar_t *)u8str;
 360         ibtail = ib + n;
 361
 362         ret_val = 0;
 363
 364         no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
 365         check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
 366         validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
 367
 368         while (ib < ibtail) {
 369                 /*
 370                  * The first byte of a UTF-8 character tells how many
 371                  * bytes will follow for the character. If the first byte
 372                  * is an illegal byte value or out of range value, we just
 373                  * return -1 with an appropriate error number.
 374                  */
 375                 sz = u8_number_of_bytes[*ib];
 376                 if (sz == U8_ILLEGAL_CHAR) {
 377                         *errno = EILSEQ;
 378                         return (-1);
 379                 }
 380
 381                 if (sz == U8_OUT_OF_RANGE_CHAR ||
 382                     (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
 383                         *errno = ERANGE;
 384                         return (-1);
 385                 }
 386
 387                 /*
 388                  * If we don't have enough bytes to check on, that's also
 389                  * an error. As you can see, we give illegal byte sequence
 390                  * checking higher priority then EINVAL cases.
 391                  */
 392                 if ((ibtail - ib) < sz) {
 393                         *errno = EINVAL;
 394                         return (-1);
 395                 }
 396
 397                 if (sz == 1) {
 398                         ib++;
 399                         ret_val++;
 400                 } else {
 401                         /*
 402                          * Check on the multi-byte UTF-8 character. For more
 403                          * details on this, see comment added for the used
 404                          * data structures at the beginning of the file.
 405                          */
 406                         f = *ib++;
 407                         ret_val++;
 408                         second = B_TRUE;
 409                         for (i = 1; i < sz; i++) {
 410                                 if (second) {
 411                                         if (*ib < u8_valid_min_2nd_byte[f] ||
 412                                             *ib > u8_valid_max_2nd_byte[f]) {
 413                                                 *errno = EILSEQ;
 414                                                 return (-1);
 415                                         }
 416                                         second = B_FALSE;
 417                                 } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
 418                                         *errno = EILSEQ;
 419                                         return (-1);
 420                                 }
 421                                 ib++;
 422                                 ret_val++;
 423                         }
 424                 }
 425
 426                 if (check_additional) {
 427                         for (p = (uchar_t **)list, i = 0; p[i]; i++) {
 428                                 s1 = ib - sz;
 429                                 s2 = p[i];
 430                                 while (s1 < ib) {
 431                                         if (*s1 != *s2 || *s2 == '\0')
 432                                                 break;
 433                                         s1++;
 434                                         s2++;
 435                                 }
 436
 437                                 if (s1 >= ib && *s2 == '\0') {
 438                                         *errno = EBADF;
 439                                         return (-1);
 440                                 }
 441                         }
 442                 }
 443
 444                 if (no_need_to_validate_entire)
 445                         break;
 446         }
 447
 448         return (ret_val);
 449 }
 450
 451 /*
 452  * The do_case_conv() looks at the mapping tables and returns found
 453  * bytes if any. If not found, the input bytes are returned. The function
 454  * always terminate the return bytes with a null character assuming that
 455  * there are plenty of room to do so.
 456  *
 457  * The case conversions are simple case conversions mapping a character to
 458  * another character as specified in the Unicode data. The byte size of
 459  * the mapped character could be different from that of the input character.
 460  *
 461  * The return value is the byte length of the returned character excluding
 462  * the terminating null byte.
 463  */
 464 static size_t
 465 do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
 466 {
 467         size_t i;
 468         uint16_t b1 = 0;
 469         uint16_t b2 = 0;
 470         uint16_t b3 = 0;
 471         uint16_t b3_tbl;
 472         uint16_t b3_base;
 473         uint16_t b4 = 0;
 474         size_t start_id;
 475         size_t end_id;
 476
 477         /*
 478          * At this point, the only possible values for sz are 2, 3, and 4.
 479          * The u8s should point to a vector that is well beyond the size of
 480          * 5 bytes.
 481          */
 482         if (sz == 2) {
 483                 b3 = u8s[0] = s[0];
 484                 b4 = u8s[1] = s[1];
 485         } else if (sz == 3) {
 486                 b2 = u8s[0] = s[0];
 487                 b3 = u8s[1] = s[1];
 488                 b4 = u8s[2] = s[2];
 489         } else if (sz == 4) {
 490                 b1 = u8s[0] = s[0];
 491                 b2 = u8s[1] = s[1];
 492                 b3 = u8s[2] = s[2];
 493                 b4 = u8s[3] = s[3];
 494         } else {
 495                 /* This is not possible but just in case as a fallback. */
 496                 if (is_it_toupper)
 497                         *u8s = U8_ASCII_TOUPPER(*s);
 498                 else
 499                         *u8s = U8_ASCII_TOLOWER(*s);
 500                 u8s[1] = '\0';
 501
 502                 return (1);
 503         }
 504         u8s[sz] = '\0';
 505
 506         /*
 507          * Let's find out if we have a corresponding character.
 508          */
 509         b1 = u8_common_b1_tbl[uv][b1];
 510         if (b1 == U8_TBL_ELEMENT_NOT_DEF)
 511                 return ((size_t)sz);
 512
 513         b2 = u8_case_common_b2_tbl[uv][b1][b2];
 514         if (b2 == U8_TBL_ELEMENT_NOT_DEF)
 515                 return ((size_t)sz);
 516
 517         if (is_it_toupper) {
 518                 b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
 519                 if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
 520                         return ((size_t)sz);
 521
 522                 start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
 523                 end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
 524
 525                 /* Either there is no match or an error at the table. */
 526                 if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
 527                         return ((size_t)sz);
 528
 529                 b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
 530
 531                 for (i = 0; start_id < end_id; start_id++)
 532                         u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
 533         } else {
 534                 b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
 535                 if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
 536                         return ((size_t)sz);
 537
 538                 start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
 539                 end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
 540
 541                 if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
 542                         return ((size_t)sz);
 543
 544                 b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
 545
 546                 for (i = 0; start_id < end_id; start_id++)
 547                         u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
 548         }
 549
 550         /*
 551          * If i is still zero, that means there is no corresponding character.
 552          */
 553         if (i == 0)
 554                 return ((size_t)sz);
 555
 556         u8s[i] = '\0';
 557
 558         return (i);
 559 }
 560
 561 /*
 562  * The do_case_compare() function compares the two input strings, s1 and s2,
 563  * one character at a time doing case conversions if applicable and return
 564  * the comparison result as like strcmp().
 565  *
 566  * Since, in empirical sense, most of text data are 7-bit ASCII characters,
 567  * we treat the 7-bit ASCII characters as a special case trying to yield
 568  * faster processing time.
 569  */
 570 static int
 571 do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
 572         size_t n2, boolean_t is_it_toupper, int *errno)
 573 {
 574         int f;
 575         int sz1;
 576         int sz2;
 577         size_t j;
 578         size_t i1;
 579         size_t i2;
 580         uchar_t u8s1[U8_MB_CUR_MAX + 1];
 581         uchar_t u8s2[U8_MB_CUR_MAX + 1];
 582
 583         i1 = i2 = 0;
 584         while (i1 < n1 && i2 < n2) {
 585                 /*
 586                  * Find out what would be the byte length for this UTF-8
 587                  * character at string s1 and also find out if this is
 588                  * an illegal start byte or not and if so, issue a proper
 589                  * errno and yet treat this byte as a character.
 590                  */
 591                 sz1 = u8_number_of_bytes[*s1];
 592                 if (sz1 < 0) {
 593                         *errno = EILSEQ;
 594                         sz1 = 1;
 595                 }
 596
 597                 /*
 598                  * For 7-bit ASCII characters mainly, we do a quick case
 599                  * conversion right at here.
 600                  *
 601                  * If we don't have enough bytes for this character, issue
 602                  * an EINVAL error and use what are available.
 603                  *
 604                  * If we have enough bytes, find out if there is
 605                  * a corresponding uppercase character and if so, copy over
 606                  * the bytes for a comparison later. If there is no
 607                  * corresponding uppercase character, then, use what we have
 608                  * for the comparison.
 609                  */
 610                 if (sz1 == 1) {
 611                         if (is_it_toupper)
 612                                 u8s1[0] = U8_ASCII_TOUPPER(*s1);
 613                         else
 614                                 u8s1[0] = U8_ASCII_TOLOWER(*s1);
 615                         s1++;
 616                         u8s1[1] = '\0';
 617                 } else if ((i1 + sz1) > n1) {
 618                         *errno = EINVAL;
 619                         for (j = 0; (i1 + j) < n1; )
 620                                 u8s1[j++] = *s1++;
 621                         u8s1[j] = '\0';
 622                 } else {
 623                         (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
 624                         s1 += sz1;
 625                 }
 626
 627                 /* Do the same for the string s2. */
 628                 sz2 = u8_number_of_bytes[*s2];
 629                 if (sz2 < 0) {
 630                         *errno = EILSEQ;
 631                         sz2 = 1;
 632                 }
 633
 634                 if (sz2 == 1) {
 635                         if (is_it_toupper)
 636                                 u8s2[0] = U8_ASCII_TOUPPER(*s2);
 637                         else
 638                                 u8s2[0] = U8_ASCII_TOLOWER(*s2);
 639                         s2++;
 640                         u8s2[1] = '\0';
 641                 } else if ((i2 + sz2) > n2) {
 642                         *errno = EINVAL;
 643                         for (j = 0; (i2 + j) < n2; )
 644                                 u8s2[j++] = *s2++;
 645                         u8s2[j] = '\0';
 646                 } else {
 647                         (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
 648                         s2 += sz2;
 649                 }
 650
 651                 /* Now compare the two characters. */
 652                 if (sz1 == 1 && sz2 == 1) {
 653                         if (*u8s1 > *u8s2)
 654                                 return (1);
 655                         if (*u8s1 < *u8s2)
 656                                 return (-1);
 657                 } else {
 658                         f = strcmp((const char *)u8s1, (const char *)u8s2);
 659                         if (f != 0)
 660                                 return (f);
 661                 }
 662
 663                 /*
 664                  * They were the same. Let's move on to the next
 665                  * characters then.
 666                  */
 667                 i1 += sz1;
 668                 i2 += sz2;
 669         }
 670
 671         /*
 672          * We compared until the end of either or both strings.
 673          *
 674          * If we reached to or went over the ends for the both, that means
 675          * they are the same.
 676          *
 677          * If we reached only one of the two ends, that means the other string
 678          * has something which then the fact can be used to determine
 679          * the return value.
 680          */
 681         if (i1 >= n1) {
 682                 if (i2 >= n2)
 683                         return (0);
 684                 return (-1);
 685         }
 686         return (1);
 687 }
 688
 689 /*
 690  * The combining_class() function checks on the given bytes and find out
 691  * the corresponding Unicode combining class value. The return value 0 means
 692  * it is a Starter. Any illegal UTF-8 character will also be treated as
 693  * a Starter.
 694  */
 695 static uchar_t
 696 combining_class(size_t uv, uchar_t *s, size_t sz)
 697 {
 698         uint16_t b1 = 0;
 699         uint16_t b2 = 0;
 700         uint16_t b3 = 0;
 701         uint16_t b4 = 0;
 702
 703         if (sz == 1 || sz > 4)
 704                 return (0);
 705
 706         if (sz == 2) {
 707                 b3 = s[0];
 708                 b4 = s[1];
 709         } else if (sz == 3) {
 710                 b2 = s[0];
 711                 b3 = s[1];
 712                 b4 = s[2];
 713         } else if (sz == 4) {
 714                 b1 = s[0];
 715                 b2 = s[1];
 716                 b3 = s[2];
 717                 b4 = s[3];
 718         }
 719
 720         b1 = u8_common_b1_tbl[uv][b1];
 721         if (b1 == U8_TBL_ELEMENT_NOT_DEF)
 722                 return (0);
 723
 724         b2 = u8_combining_class_b2_tbl[uv][b1][b2];
 725         if (b2 == U8_TBL_ELEMENT_NOT_DEF)
 726                 return (0);
 727
 728         b3 = u8_combining_class_b3_tbl[uv][b2][b3];
 729         if (b3 == U8_TBL_ELEMENT_NOT_DEF)
 730                 return (0);
 731
 732         return (u8_combining_class_b4_tbl[uv][b3][b4]);
 733 }
 734
 735 /*
 736  * The do_decomp() function finds out a matching decomposition if any
 737  * and return. If there is no match, the input bytes are copied and returned.
 738  * The function also checks if there is a Hangul, decomposes it if necessary
 739  * and returns.
 740  *
 741  * To save time, a single byte 7-bit ASCII character should be handled by
 742  * the caller.
 743  *
 744  * The function returns the number of bytes returned sans always terminating
 745  * the null byte. It will also return a state that will tell if there was
 746  * a Hangul character decomposed which then will be used by the caller.
 747  */
 748 static size_t
 749 do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
 750         boolean_t canonical_decomposition, u8_normalization_states_t *state)
 751 {
 752         uint16_t b1 = 0;
 753         uint16_t b2 = 0;
 754         uint16_t b3 = 0;
 755         uint16_t b3_tbl;
 756         uint16_t b3_base;
 757         uint16_t b4 = 0;
 758         size_t start_id;
 759         size_t end_id;
 760         size_t i;
 761         uint32_t u1;
 762
 763         if (sz == 2) {
 764                 b3 = u8s[0] = s[0];
 765                 b4 = u8s[1] = s[1];
 766                 u8s[2] = '\0';
 767         } else if (sz == 3) {
 768                 /* Convert it to a Unicode scalar value. */
 769                 U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
 770
 771                 /*
 772                  * If this is a Hangul syllable, we decompose it into
 773                  * a leading consonant, a vowel, and an optional trailing
 774                  * consonant and then return.
 775                  */
 776                 if (U8_HANGUL_SYLLABLE(u1)) {
 777                         u1 -= U8_HANGUL_SYL_FIRST;
 778
 779                         b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
 780                         b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
 781                             / U8_HANGUL_T_COUNT;
 782                         b3 = u1 % U8_HANGUL_T_COUNT;
 783
 784                         U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
 785                         U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
 786                         if (b3) {
 787                                 b3 += U8_HANGUL_JAMO_T_FIRST;
 788                                 U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
 789
 790                                 u8s[9] = '\0';
 791                                 *state = U8_STATE_HANGUL_LVT;
 792                                 return (9);
 793                         }
 794
 795                         u8s[6] = '\0';
 796                         *state = U8_STATE_HANGUL_LV;
 797                         return (6);
 798                 }
 799
 800                 b2 = u8s[0] = s[0];
 801                 b3 = u8s[1] = s[1];
 802                 b4 = u8s[2] = s[2];
 803                 u8s[3] = '\0';
 804
 805                 /*
 806                  * If this is a Hangul Jamo, we know there is nothing
 807                  * further that we can decompose.
 808                  */
 809                 if (U8_HANGUL_JAMO_L(u1)) {
 810                         *state = U8_STATE_HANGUL_L;
 811                         return (3);
 812                 }
 813
 814                 if (U8_HANGUL_JAMO_V(u1)) {
 815                         if (*state == U8_STATE_HANGUL_L)
 816                                 *state = U8_STATE_HANGUL_LV;
 817                         else
 818                                 *state = U8_STATE_HANGUL_V;
 819                         return (3);
 820                 }
 821
 822                 if (U8_HANGUL_JAMO_T(u1)) {
 823                         if (*state == U8_STATE_HANGUL_LV)
 824                                 *state = U8_STATE_HANGUL_LVT;
 825                         else
 826                                 *state = U8_STATE_HANGUL_T;
 827                         return (3);
 828                 }
 829         } else if (sz == 4) {
 830                 b1 = u8s[0] = s[0];
 831                 b2 = u8s[1] = s[1];
 832                 b3 = u8s[2] = s[2];
 833                 b4 = u8s[3] = s[3];
 834                 u8s[4] = '\0';
 835         } else {
 836                 /*
 837                  * This is a fallback and should not happen if the function
 838                  * was called properly.
 839                  */
 840                 u8s[0] = s[0];
 841                 u8s[1] = '\0';
 842                 *state = U8_STATE_START;
 843                 return (1);
 844         }
 845
 846         /*
 847          * At this point, this rountine does not know what it would get.
 848          * The caller should sort it out if the state isn't a Hangul one.
 849          */
 850         *state = U8_STATE_START;
 851
 852         /* Try to find matching decomposition mapping byte sequence. */
 853         b1 = u8_common_b1_tbl[uv][b1];
 854         if (b1 == U8_TBL_ELEMENT_NOT_DEF)
 855                 return ((size_t)sz);
 856
 857         b2 = u8_decomp_b2_tbl[uv][b1][b2];
 858         if (b2 == U8_TBL_ELEMENT_NOT_DEF)
 859                 return ((size_t)sz);
 860
 861         b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
 862         if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
 863                 return ((size_t)sz);
 864
 865         /*
 866          * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
 867          * which is 0x8000, this means we couldn't fit the mappings into
 868          * the cardinality of a unsigned byte.
 869          */
 870         if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
 871                 b3_tbl -= U8_16BIT_TABLE_INDICATOR;
 872                 start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
 873                 end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
 874         } else {
 875                 start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
 876                 end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
 877         }
 878
 879         /* This also means there wasn't any matching decomposition. */
 880         if (start_id >= end_id)
 881                 return ((size_t)sz);
 882
 883         /*
 884          * The final table for decomposition mappings has three types of
 885          * byte sequences depending on whether a mapping is for compatibility
 886          * decomposition, canonical decomposition, or both like the following:
 887          *
 888          * (1) Compatibility decomposition mappings:
 889          *
 890          *      +---+---+-...-+---+
 891          *      | B0| B1| ... | Bm|
 892          *      +---+---+-...-+---+
 893          *
 894          *      The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
 895          *
 896          * (2) Canonical decomposition mappings:
 897          *
 898          *      +---+---+---+-...-+---+
 899          *      | T | b0| b1| ... | bn|
 900          *      +---+---+---+-...-+---+
 901          *
 902          *      where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
 903          *
 904          * (3) Both mappings:
 905          *
 906          *      +---+---+---+---+-...-+---+---+---+-...-+---+
 907          *      | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
 908          *      +---+---+---+---+-...-+---+---+---+-...-+---+
 909          *
 910          *      where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
 911          *      byte, b0 to bn are canonical mapping bytes and B0 to Bm are
 912          *      compatibility mapping bytes.
 913          *
 914          * Note that compatibility decomposition means doing recursive
 915          * decompositions using both compatibility decomposition mappings and
 916          * canonical decomposition mappings. On the other hand, canonical
 917          * decomposition means doing recursive decompositions using only
 918          * canonical decomposition mappings. Since the table we have has gone
 919          * through the recursions already, we do not need to do so during
 920          * runtime, i.e., the table has been completely flattened out
 921          * already.
 922          */
 923
 924         b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
 925
 926         /* Get the type, T, of the byte sequence. */
 927         b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
 928
 929         /*
 930          * If necessary, adjust start_id, end_id, or both. Note that if
 931          * this is compatibility decomposition mapping, there is no
 932          * adjustment.
 933          */
 934         if (canonical_decomposition) {
 935                 /* Is the mapping only for compatibility decomposition? */
 936                 if (b1 < U8_DECOMP_BOTH)
 937                         return ((size_t)sz);
 938
 939                 start_id++;
 940
 941                 if (b1 == U8_DECOMP_BOTH) {
 942                         end_id = start_id +
 943                             u8_decomp_final_tbl[uv][b3_base + start_id];
 944                         start_id++;
 945                 }
 946         } else {
 947                 /*
 948                  * Unless this is a compatibility decomposition mapping,
 949                  * we adjust the start_id.
 950                  */
 951                 if (b1 == U8_DECOMP_BOTH) {
 952                         start_id++;
 953                         start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
 954                 } else if (b1 == U8_DECOMP_CANONICAL) {
 955                         start_id++;
 956                 }
 957         }
 958
 959         for (i = 0; start_id < end_id; start_id++)
 960                 u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
 961         u8s[i] = '\0';
 962
 963         return (i);
 964 }
 965
 966 /*
 967  * The find_composition_start() function uses the character bytes given and
 968  * find out the matching composition mappings if any and return the address
 969  * to the composition mappings as explained in the do_composition().
 970  */
 971 static uchar_t *
 972 find_composition_start(size_t uv, uchar_t *s, size_t sz)
 973 {
 974         uint16_t b1 = 0;
 975         uint16_t b2 = 0;
 976         uint16_t b3 = 0;
 977         uint16_t b3_tbl;
 978         uint16_t b3_base;
 979         uint16_t b4 = 0;
 980         size_t start_id;
 981         size_t end_id;
 982
 983         if (sz == 1) {
 984                 b4 = s[0];
 985         } else if (sz == 2) {
 986                 b3 = s[0];
 987                 b4 = s[1];
 988         } else if (sz == 3) {
 989                 b2 = s[0];
 990                 b3 = s[1];
 991                 b4 = s[2];
 992         } else if (sz == 4) {
 993                 b1 = s[0];
 994                 b2 = s[1];
 995                 b3 = s[2];
 996                 b4 = s[3];
 997         } else {
 998                 /*
 999                  * This is a fallback and should not happen if the function
1000                  * was called properly.
1001                  */
1002                 return (NULL);
1003         }
1004
1005         b1 = u8_composition_b1_tbl[uv][b1];
1006         if (b1 == U8_TBL_ELEMENT_NOT_DEF)
1007                 return (NULL);
1008
1009         b2 = u8_composition_b2_tbl[uv][b1][b2];
1010         if (b2 == U8_TBL_ELEMENT_NOT_DEF)
1011                 return (NULL);
1012
1013         b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
1014         if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
1015                 return (NULL);
1016
1017         if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
1018                 b3_tbl -= U8_16BIT_TABLE_INDICATOR;
1019                 start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
1020                 end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
1021         } else {
1022                 start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
1023                 end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
1024         }
1025
1026         if (start_id >= end_id)
1027                 return (NULL);
1028
1029         b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
1030
1031         return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
1032 }
1033
1034 /*
1035  * The blocked() function checks on the combining class values of previous
1036  * characters in this sequence and return whether it is blocked or not.
1037  */
1038 static boolean_t
1039 blocked(uchar_t *comb_class, size_t last)
1040 {
1041         uchar_t my_comb_class;
1042         size_t i;
1043
1044         my_comb_class = comb_class[last];
1045         for (i = 1; i < last; i++)
1046                 if (comb_class[i] >= my_comb_class ||
1047                     comb_class[i] == U8_COMBINING_CLASS_STARTER)
1048                         return (B_TRUE);
1049
1050         return (B_FALSE);
1051 }
1052
/*
 * The do_composition() function reads the character sequence pointed to by
 * 's', does the necessary canonical composition, and then copies the result
 * back to 's'.
 *
 * Character i of the sequence occupies disp[i] bytes starting at
 * s + start[i] and has the combining class comb_class[i]. The 'last' is
 * the index of the last character, not last + 1. The 'uv' is the first
 * index into every lookup table used (presumably the Unicode version slot;
 * confirm against the callers).
 *
 * If the last character is a Starter, the function also looks ahead at the
 * source text between *os and 'oslast' and keeps composing with the
 * multi-byte characters found there. On return from that path, *os points
 * just past the characters that were consumed.
 *
 * The return value is the byte length of the composed sequence now stored
 * at 's', not counting the terminating null byte.
 *
 * The input argument 's' cannot contain more than 32 characters.
 */
static size_t
do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
        uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
{
        uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
        uchar_t tc[U8_MB_CUR_MAX];
        uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
        size_t saved_marks_count;
        uchar_t *p;
        uchar_t *saved_p;
        uchar_t *q;
        size_t i;
        size_t saved_i;
        size_t j;
        size_t k;
        size_t l;
        size_t C;
        size_t saved_l;
        size_t size;
        uint32_t u1;
        uint32_t u2;
        boolean_t match_not_found = B_TRUE;

        /*
         * This should never happen unless the callers are doing some strange
         * and unexpected things.
         *
         * The "last" is the index pointing to the last character not last + 1.
         */
        if (last >= U8_MAX_CHARS_A_SEQ)
                last = U8_UPPER_LIMIT_IN_A_SEQ;

        /* 'i' indexes the input characters; 'l' is the write offset in t. */
        for (i = l = 0; i <= last; i++) {
                /*
                 * The last or any non-Starters at the beginning, we don't
                 * have any chance to do composition and so we just copy them
                 * to the temporary buffer.
                 */
                if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
SAVE_THE_CHAR:
                        p = s + start[i];
                        size = disp[i];
                        for (k = 0; k < size; k++)
                                t[l++] = *p++;
                        continue;
                }

                /*
                 * If this could be a start of Hangul Jamos, then, we try to
                 * conjoin them.
                 */
                if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
                        /*
                         * u1 is this character; u2 is built from the three
                         * bytes right after it and is only acted upon if it
                         * turns out to be a Jamo V.
                         */
                        U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
                            s[start[i] + 1], s[start[i] + 2]);
                        U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
                            s[start[i] + 4], s[start[i] + 5]);

                        if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
                                u1 -= U8_HANGUL_JAMO_L_FIRST;
                                u2 -= U8_HANGUL_JAMO_V_FIRST;
                                u1 = U8_HANGUL_SYL_FIRST +
                                    (u1 * U8_HANGUL_V_COUNT + u2) *
                                    U8_HANGUL_T_COUNT;

                                /*
                                 * Step over the L and the V; 'i' now indexes
                                 * a possible trailing consonant.
                                 */
                                i += 2;
                                if (i <= last) {
                                        U8_PUT_3BYTES_INTO_UTF32(u2,
                                            s[start[i]], s[start[i] + 1],
                                            s[start[i] + 2]);

                                        if (U8_HANGUL_JAMO_T(u2)) {
                                                u1 += u2 -
                                                    U8_HANGUL_JAMO_T_FIRST;
                                                i++;
                                        }
                                }

                                U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
                                /*
                                 * The for() loop will increment 'i' again,
                                 * so step back by one here.
                                 */
                                i--;
                                l += 3;
                                continue;
                        }
                }

                /*
                 * Let's then find out if this Starter has composition
                 * mapping.
                 */
                p = find_composition_start(uv, s + start[i], disp[i]);
                if (p == NULL)
                        goto SAVE_THE_CHAR;

                /*
                 * We have a Starter with composition mapping and the next
                 * character is a non-Starter. Let's try to find out if
                 * we can do composition.
                 */

                /*
                 * Remember where this Starter's mappings, input index, and
                 * output offset are so that later attempts can restart from
                 * them.
                 */
                saved_p = p;
                saved_i = i;
                saved_l = l;
                saved_marks_count = 0;

TRY_THE_NEXT_MARK:
                q = s + start[++i];
                size = disp[i];

                /*
                 * The next for() loop compares the non-Starter pointed by
                 * 'q' with the possible (joinable) characters pointed by 'p'.
                 *
                 * The composition final table entry pointed by the 'p'
                 * looks like the following:
                 *
                 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
                 * | C | b0| b1| ... | bn| F | B0| B1| ... | Bm| F |
                 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
                 *
                 * where C is the count byte indicating the number of
                 * mapping pairs where each pair would be look like
                 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
                 * character of a canonical decomposition and the B0-Bm are
                 * the bytes of a matching composite character. The F is
                 * a filler byte after each character as the separator.
                 */

                match_not_found = B_TRUE;

                for (C = *p++; C > 0; C--) {
                        for (k = 0; k < size; p++, k++)
                                if (*p != q[k])
                                        break;

                        /* Have we found it? */
                        if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
                                match_not_found = B_FALSE;

                                /*
                                 * Write the composite over whatever has been
                                 * stored for this Starter so far.
                                 */
                                l = saved_l;

                                while (*++p != U8_TBL_ELEMENT_FILLER)
                                        t[l++] = *p;

                                break;
                        }

                        /* We didn't find; skip to the next pair. */
                        if (*p != U8_TBL_ELEMENT_FILLER)
                                while (*++p != U8_TBL_ELEMENT_FILLER)
                                        ;
                        while (*++p != U8_TBL_ELEMENT_FILLER)
                                ;
                        p++;
                }

                /*
                 * If there was no match, we will need to save the combining
                 * mark for later appending. After that, if the next one
                 * is a non-Starter and not blocked, then, we try once
                 * again to do composition with the next non-Starter.
                 *
                 * If there was no match and this was a Starter, then,
                 * this is a new start.
                 *
                 * If there was a match and a composition done and we have
                 * more to check on, then, we retrieve a new composition final
                 * table entry for the composite and then try to do the
                 * composition again.
                 */

                if (match_not_found) {
                        if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
                                i--;
                                goto SAVE_THE_CHAR;
                        }

                        saved_marks[saved_marks_count++] = i;
                }

                /*
                 * saved_l == l means that nothing has been composed for
                 * this Starter so far.
                 */
                if (saved_l == l) {
                        /*
                         * Set aside every following character that is
                         * blocked; stop at the first one that is not.
                         */
                        while (i < last) {
                                if (blocked(comb_class, i + 1))
                                        saved_marks[saved_marks_count++] = ++i;
                                else
                                        break;
                        }
                        if (i < last) {
                                p = saved_p;
                                goto TRY_THE_NEXT_MARK;
                        }
                } else if (i < last) {
                        p = find_composition_start(uv, t + saved_l,
                            l - saved_l);
                        if (p != NULL) {
                                saved_p = p;
                                goto TRY_THE_NEXT_MARK;
                        }
                }

                /*
                 * There is no more composition possible.
                 *
                 * If there was no composition what so ever then we copy
                 * over the original Starter and then append any non-Starters
                 * remaining at the target string sequentially after that.
                 */

                if (saved_l == l) {
                        p = s + start[saved_i];
                        size = disp[saved_i];
                        for (j = 0; j < size; j++)
                                t[l++] = *p++;
                }

                for (k = 0; k < saved_marks_count; k++) {
                        p = s + start[saved_marks[k]];
                        size = disp[saved_marks[k]];
                        for (j = 0; j < size; j++)
                                t[l++] = *p++;
                }
        }

        /*
         * If the last character is a Starter and if we have a character
         * (possibly another Starter) that can be turned into a composite,
         * we do so and we do so until there is no more of composition
         * possible.
         */
        if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
                p = *os;
                /*
                 * Treat the final disp[last] bytes of t as the character
                 * to compose with.
                 */
                saved_l = l - disp[last];

                while (p < oslast) {
                        /*
                         * Only a complete multi-byte character that fits
                         * before 'oslast' is considered.
                         */
                        size = u8_number_of_bytes[*p];
                        if (size <= 1 || (p + size) > oslast)
                                break;

                        saved_p = p;

                        for (i = 0; i < size; i++)
                                tc[i] = *p++;

                        q = find_composition_start(uv, t + saved_l,
                            l - saved_l);
                        if (q == NULL) {
                                p = saved_p;
                                break;
                        }

                        match_not_found = B_TRUE;

                        /* Same pair-by-pair scan as in the main loop above. */
                        for (C = *q++; C > 0; C--) {
                                for (k = 0; k < size; q++, k++)
                                        if (*q != tc[k])
                                                break;

                                if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
                                        match_not_found = B_FALSE;

                                        l = saved_l;

                                        while (*++q != U8_TBL_ELEMENT_FILLER) {
                                                /*
                                                 * This is practically
                                                 * impossible but we don't
                                                 * want to take any chances.
                                                 */
                                                if (l >=
                                                    U8_STREAM_SAFE_TEXT_MAX) {
                                                        p = saved_p;
                                                        goto SAFE_RETURN;
                                                }
                                                t[l++] = *q;
                                        }

                                        break;
                                }

                                if (*q != U8_TBL_ELEMENT_FILLER)
                                        while (*++q != U8_TBL_ELEMENT_FILLER)
                                                ;
                                while (*++q != U8_TBL_ELEMENT_FILLER)
                                        ;
                                q++;
                        }

                        /* Leave the unmatched character for the caller. */
                        if (match_not_found) {
                                p = saved_p;
                                break;
                        }
                }
SAFE_RETURN:
                /* Tell the caller how far the source text was consumed. */
                *os = p;
        }

        /*
         * Now we copy over the temporary string to the target string.
         * Since composition always reduces the number of characters or
         * the number of characters stay, we don't need to worry about
         * the buffer overflow here.
         */
        for (i = 0; i < l; i++)
                s[i] = t[i];
        s[l] = '\0';

        return (l);
}
1366
/*
 * The collect_a_seq() function checks on the given string s, collects
 * a sequence of characters at u8s, and returns the sequence. While it
 * collects a sequence, it also applies case conversion, canonical or
 * compatibility decomposition, canonical composition, or some or all of
 * them and in that order.
 *
 * The collected sequence cannot be bigger than 32 characters since if
 * it is having more than 31 characters, the sequence will be terminated
 * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
 * a Stream-Safe Text. The collected sequence is always terminated with
 * a null byte and the return value is the byte length of the sequence,
 * which can be 0. The return value does not include the terminating
 * null byte.
 */
1382 static size_t
1383 collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
1384         boolean_t is_it_toupper,
1385         boolean_t is_it_tolower,
1386         boolean_t canonical_decomposition,
1387         boolean_t compatibility_decomposition,
1388         boolean_t canonical_composition,
1389         int *errno, u8_normalization_states_t *state)
1390 {
1391         uchar_t *s;
1392         int sz;
1393         int saved_sz;
1394         size_t i;
1395         size_t j;
1396         size_t k;
1397         size_t l;
1398         uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
1399         uchar_t disp[U8_MAX_CHARS_A_SEQ];
1400         uchar_t start[U8_MAX_CHARS_A_SEQ];
1401         uchar_t u8t[U8_MB_CUR_MAX];
1402         uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
1403         uchar_t tc;
1404         size_t last;
1405         size_t saved_last;
1406         uint32_t u1;
1407
1408         /*
1409          * Save the source string pointer which we will return a changed
1410          * pointer if we do processing.
1411          */
1412         s = *source;
1413
1414         /*
1415          * The following is a fallback for just in case callers are not
1416          * checking the string boundaries before the calling.
1417          */
1418         if (s >= slast) {
1419                 u8s[0] = '\0';
1420
1421                 return (0);
1422         }
1423
1424         /*
1425          * As the first thing, let's collect a character and do case
1426          * conversion if necessary.
1427          */
1428
1429         sz = u8_number_of_bytes[*s];
1430
1431         if (sz < 0) {
1432                 *errno = EILSEQ;
1433
1434                 u8s[0] = *s++;
1435                 u8s[1] = '\0';
1436
1437                 *source = s;
1438
1439                 return (1);
1440         }
1441
1442         if (sz == 1) {
1443                 if (is_it_toupper)
1444                         u8s[0] = U8_ASCII_TOUPPER(*s);
1445                 else if (is_it_tolower)
1446                         u8s[0] = U8_ASCII_TOLOWER(*s);
1447                 else
1448                         u8s[0] = *s;
1449                 s++;
1450                 u8s[1] = '\0';
1451         } else if ((s + sz) > slast) {
1452                 *errno = EINVAL;
1453
1454                 for (i = 0; s < slast; )
1455                         u8s[i++] = *s++;
1456                 u8s[i] = '\0';
1457
1458                 *source = s;
1459
1460                 return (i);
1461         } else {
1462                 if (is_it_toupper || is_it_tolower) {
1463                         i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
1464                         s += sz;
1465                         sz = i;
1466                 } else {
1467                         for (i = 0; i < sz; )
1468                                 u8s[i++] = *s++;
1469                         u8s[i] = '\0';
1470                 }
1471         }
1472
1473         /*
1474          * And then canonical/compatibility decomposition followed by
1475          * an optional canonical composition. Please be noted that
1476          * canonical composition is done only when a decomposition is
1477          * done.
1478          */
1479         if (canonical_decomposition || compatibility_decomposition) {
1480                 if (sz == 1) {
1481                         *state = U8_STATE_START;
1482
1483                         saved_sz = 1;
1484
1485                         comb_class[0] = 0;
1486                         start[0] = 0;
1487                         disp[0] = 1;
1488
1489                         last = 1;
1490                 } else {
1491                         saved_sz = do_decomp(uv, u8s, u8s, sz,
1492                             canonical_decomposition, state);
1493
1494                         last = 0;
1495
1496                         for (i = 0; i < saved_sz; ) {
1497                                 sz = u8_number_of_bytes[u8s[i]];
1498
1499                                 comb_class[last] = combining_class(uv,
1500                                     u8s + i, sz);
1501                                 start[last] = i;
1502                                 disp[last] = sz;
1503
1504                                 last++;
1505                                 i += sz;
1506                         }
1507
1508                         /*
1509                          * Decomposition yields various Hangul related
1510                          * states but not on combining marks. We need to
1511                          * find out at here by checking on the last
1512                          * character.
1513                          */
1514                         if (*state == U8_STATE_START) {
1515                                 if (comb_class[last - 1])
1516                                         *state = U8_STATE_COMBINING_MARK;
1517                         }
1518                 }
1519
1520                 saved_last = last;
1521
1522                 while (s < slast) {
1523                         sz = u8_number_of_bytes[*s];
1524
1525                         /*
1526                          * If this is an illegal character, an incomplete
1527                          * character, or an 7-bit ASCII Starter character,
1528                          * then we have collected a sequence; break and let
1529                          * the next call deal with the two cases.
1530                          *
1531                          * Note that this is okay only if you are using this
1532                          * function with a fixed length string, not on
1533                          * a buffer with multiple calls of one chunk at a time.
1534                          */
1535                         if (sz <= 1) {
1536                                 break;
1537                         } else if ((s + sz) > slast) {
1538                                 break;
1539                         } else {
1540                                 /*
1541                                  * If the previous character was a Hangul Jamo
1542                                  * and this character is a Hangul Jamo that
1543                                  * can be conjoined, we collect the Jamo.
1544                                  */
1545                                 if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
1546                                         U8_PUT_3BYTES_INTO_UTF32(u1,
1547                                             *s, *(s + 1), *(s + 2));
1548
1549                                         if (U8_HANGUL_COMPOSABLE_L_V(*state,
1550                                             u1)) {
1551                                                 i = 0;
1552                                                 *state = U8_STATE_HANGUL_LV;
1553                                                 goto COLLECT_A_HANGUL;
1554                                         }
1555
1556                                         if (U8_HANGUL_COMPOSABLE_LV_T(*state,
1557                                             u1)) {
1558                                                 i = 0;
1559                                                 *state = U8_STATE_HANGUL_LVT;
1560                                                 goto COLLECT_A_HANGUL;
1561                                         }
1562                                 }
1563
1564                                 /*
1565                                  * Regardless of whatever it was, if this is
1566                                  * a Starter, we don't collect the character
1567                                  * since that's a new start and we will deal
1568                                  * with it at the next time.
1569                                  */
1570                                 i = combining_class(uv, s, sz);
1571                                 if (i == U8_COMBINING_CLASS_STARTER)
1572                                         break;
1573
1574                                 /*
1575                                  * We know the current character is a combining
1576                                  * mark. If the previous character wasn't
1577                                  * a Starter (not Hangul) or a combining mark,
1578                                  * then, we don't collect this combining mark.
1579                                  */
1580                                 if (*state != U8_STATE_START &&
1581                                     *state != U8_STATE_COMBINING_MARK)
1582                                         break;
1583
1584                                 *state = U8_STATE_COMBINING_MARK;
1585 COLLECT_A_HANGUL:
1586                                 /*
1587                                  * If we collected a Starter and combining
1588                                  * marks up to 30, i.e., total 31 characters,
1589                                  * then, we terminate this degenerately long
1590                                  * combining sequence with a U+034F COMBINING
1591                                  * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
1592                                  * UTF-8 and turn this into a Stream-Safe
1593                                  * Text. This will be extremely rare but
1594                                  * possible.
1595                                  *
1596                                  * The following will also guarantee that
1597                                  * we are not writing more than 32 characters
1598                                  * plus a NULL at u8s[].
1599                                  */
1600                                 if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
1601 TURN_STREAM_SAFE:
1602                                         *state = U8_STATE_START;
1603                                         comb_class[last] = 0;
1604                                         start[last] = saved_sz;
1605                                         disp[last] = 2;
1606                                         last++;
1607
1608                                         u8s[saved_sz++] = 0xCD;
1609                                         u8s[saved_sz++] = 0x8F;
1610
1611                                         break;
1612                                 }
1613
1614                                 /*
1615                                  * Some combining marks also do decompose into
1616                                  * another combining mark or marks.
1617                                  */
1618                                 if (*state == U8_STATE_COMBINING_MARK) {
1619                                         k = last;
1620                                         l = sz;
1621                                         i = do_decomp(uv, uts, s, sz,
1622                                             canonical_decomposition, state);
1623                                         for (j = 0; j < i; ) {
1624                                                 sz = u8_number_of_bytes[uts[j]];
1625
1626                                                 comb_class[last] =
1627                                                     combining_class(uv,
1628                                                     uts + j, sz);
1629                                                 start[last] = saved_sz + j;
1630                                                 disp[last] = sz;
1631
1632                                                 last++;
1633                                                 if (last >=
1634                                                     U8_UPPER_LIMIT_IN_A_SEQ) {
1635                                                         last = k;
1636                                                         goto TURN_STREAM_SAFE;
1637                                                 }
1638                                                 j += sz;
1639                                         }
1640
1641                                         *state = U8_STATE_COMBINING_MARK;
1642                                         sz = i;
1643                                         s += l;
1644
1645                                         for (i = 0; i < sz; i++)
1646                                                 u8s[saved_sz++] = uts[i];
1647                                 } else {
1648                                         comb_class[last] = i;
1649                                         start[last] = saved_sz;
1650                                         disp[last] = sz;
1651                                         last++;
1652
1653                                         for (i = 0; i < sz; i++)
1654                                                 u8s[saved_sz++] = *s++;
1655                                 }
1656
1657                                 /*
1658                                  * If this is U+0345 COMBINING GREEK
1659                                  * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
1660                                  * iota subscript, and need to be converted to
1661                                  * uppercase letter, convert it to U+0399 GREEK
1662                                  * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
1663                                  * i.e., convert to capital adscript form as
1664                                  * specified in the Unicode standard.
1665                                  *
1666                                  * This is the only special case of (ambiguous)
1667                                  * case conversion at combining marks and
1668                                  * probably the standard will never have
1669                                  * anything similar like this in future.
1670                                  */
1671                                 if (is_it_toupper && sz >= 2 &&
1672                                     u8s[saved_sz - 2] == 0xCD &&
1673                                     u8s[saved_sz - 1] == 0x85) {
1674                                         u8s[saved_sz - 2] = 0xCE;
1675                                         u8s[saved_sz - 1] = 0x99;
1676                                 }
1677                         }
1678                 }
1679
1680                 /*
1681                  * Let's try to ensure a canonical ordering for the collected
1682                  * combining marks. We do this only if we have collected
1683                  * at least one more non-Starter. (The decomposition mapping
1684                  * data tables have fully (and recursively) expanded and
1685                  * canonically ordered decompositions.)
1686                  *
1687                  * The U8_SWAP_COMB_MARKS() convenience macro has some
1688                  * assumptions and we are meeting the assumptions.
1689                  */
1690                 last--;
1691                 if (last >= saved_last) {
1692                         for (i = 0; i < last; i++)
1693                                 for (j = last; j > i; j--)
1694                                         if (comb_class[j] &&
1695                                             comb_class[j - 1] > comb_class[j]) {
1696                                                 U8_SWAP_COMB_MARKS(j - 1, j);
1697                                         }
1698                 }
1699
1700                 *source = s;
1701
1702                 if (! canonical_composition) {
1703                         u8s[saved_sz] = '\0';
1704                         return (saved_sz);
1705                 }
1706
1707                 /*
1708                  * Now do the canonical composition. Note that we do this
1709                  * only after a canonical or compatibility decomposition to
1710                  * finish up NFC or NFKC.
1711                  */
1712                 sz = do_composition(uv, u8s, comb_class, start, disp, last,
1713                     &s, slast);
1714         }
1715
1716         *source = s;
1717
1718         return ((size_t)sz);
1719 }
1720
1721 /*
1722  * The do_norm_compare() function does string comparion based on Unicode
1723  * simple case mappings and Unicode Normalization definitions.
1724  *
1725  * It does so by collecting a sequence of character at a time and comparing
1726  * the collected sequences from the strings.
1727  *
1728  * The meanings on the return values are the same as the usual strcmp().
1729  */
1730 static int
1731 do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1732         int flag, int *errno)
1733 {
1734         int result;
1735         size_t sz1;
1736         size_t sz2;
1737         uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
1738         uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
1739         uchar_t *s1last;
1740         uchar_t *s2last;
1741         boolean_t is_it_toupper;
1742         boolean_t is_it_tolower;
1743         boolean_t canonical_decomposition;
1744         boolean_t compatibility_decomposition;
1745         boolean_t canonical_composition;
1746         u8_normalization_states_t state;
1747
1748         s1last = s1 + n1;
1749         s2last = s2 + n2;
1750
1751         is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1752         is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1753         canonical_decomposition = flag & U8_CANON_DECOMP;
1754         compatibility_decomposition = flag & U8_COMPAT_DECOMP;
1755         canonical_composition = flag & U8_CANON_COMP;
1756
1757         while (s1 < s1last && s2 < s2last) {
1758                 /*
1759                  * If the current character is a 7-bit ASCII and the last
1760                  * character, or, if the current character and the next
1761                  * character are both some 7-bit ASCII characters then
1762                  * we treat the current character as a sequence.
1763                  *
1764                  * In any other cases, we need to call collect_a_seq().
1765                  */
1766
1767                 if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
1768                     ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
1769                         if (is_it_toupper)
1770                                 u8s1[0] = U8_ASCII_TOUPPER(*s1);
1771                         else if (is_it_tolower)
1772                                 u8s1[0] = U8_ASCII_TOLOWER(*s1);
1773                         else
1774                                 u8s1[0] = *s1;
1775                         u8s1[1] = '\0';
1776                         sz1 = 1;
1777                         s1++;
1778                 } else {
1779                         state = U8_STATE_START;
1780                         sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
1781                             is_it_toupper, is_it_tolower,
1782                             canonical_decomposition,
1783                             compatibility_decomposition,
1784                             canonical_composition, errno, &state);
1785                 }
1786
1787                 if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
1788                     ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
1789                         if (is_it_toupper)
1790                                 u8s2[0] = U8_ASCII_TOUPPER(*s2);
1791                         else if (is_it_tolower)
1792                                 u8s2[0] = U8_ASCII_TOLOWER(*s2);
1793                         else
1794                                 u8s2[0] = *s2;
1795                         u8s2[1] = '\0';
1796                         sz2 = 1;
1797                         s2++;
1798                 } else {
1799                         state = U8_STATE_START;
1800                         sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
1801                             is_it_toupper, is_it_tolower,
1802                             canonical_decomposition,
1803                             compatibility_decomposition,
1804                             canonical_composition, errno, &state);
1805                 }
1806
1807                 /*
1808                  * Now compare the two characters. If they are the same,
1809                  * we move on to the next character sequences.
1810                  */
1811                 if (sz1 == 1 && sz2 == 1) {
1812                         if (*u8s1 > *u8s2)
1813                                 return (1);
1814                         if (*u8s1 < *u8s2)
1815                                 return (-1);
1816                 } else {
1817                         result = strcmp((const char *)u8s1, (const char *)u8s2);
1818                         if (result != 0)
1819                                 return (result);
1820                 }
1821         }
1822
1823         /*
1824          * We compared until the end of either or both strings.
1825          *
1826          * If we reached to or went over the ends for the both, that means
1827          * they are the same.
1828          *
1829          * If we reached only one end, that means the other string has
1830          * something which then can be used to determine the return value.
1831          */
1832         if (s1 >= s1last) {
1833                 if (s2 >= s2last)
1834                         return (0);
1835                 return (-1);
1836         }
1837         return (1);
1838 }
1839
1840 /*
1841  * The u8_strcmp() function compares two UTF-8 strings quite similar to
1842  * the strcmp(). For the comparison, however, Unicode Normalization specific
1843  * equivalency and Unicode simple case conversion mappings based equivalency
1844  * can be requested and checked against.
1845  */
1846 int
1847 u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1848                 int *errno)
1849 {
1850         int f;
1851         size_t n1;
1852         size_t n2;
1853
1854         *errno = 0;
1855
1856         /*
1857          * Check on the requested Unicode version, case conversion, and
1858          * normalization flag values.
1859          */
1860
1861         if (uv > U8_UNICODE_LATEST) {
1862                 *errno = ERANGE;
1863                 uv = U8_UNICODE_LATEST;
1864         }
1865
1866         if (flag == 0) {
1867                 flag = U8_STRCMP_CS;
1868         } else {
1869                 f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
1870                     U8_STRCMP_CI_LOWER);
1871                 if (f == 0) {
1872                         flag |= U8_STRCMP_CS;
1873                 } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
1874                     f != U8_STRCMP_CI_LOWER) {
1875                         *errno = EBADF;
1876                         flag = U8_STRCMP_CS;
1877                 }
1878
1879                 f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1880                 if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
1881                     f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1882                         *errno = EBADF;
1883                         flag = U8_STRCMP_CS;
1884                 }
1885         }
1886
1887         if (flag == U8_STRCMP_CS) {
1888                 return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
1889         }
1890
1891         n1 = strlen(s1);
1892         n2 = strlen(s2);
1893         if (n != 0) {
1894                 if (n < n1)
1895                         n1 = n;
1896                 if (n < n2)
1897                         n2 = n;
1898         }
1899
1900         /*
1901          * Simple case conversion can be done much faster and so we do
1902          * them separately here.
1903          */
1904         if (flag == U8_STRCMP_CI_UPPER) {
1905                 return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1906                     n1, n2, B_TRUE, errno));
1907         } else if (flag == U8_STRCMP_CI_LOWER) {
1908                 return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1909                     n1, n2, B_FALSE, errno));
1910         }
1911
1912         return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1913             flag, errno));
1914 }
1915
1916 size_t
1917 u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1918         int flag, size_t unicode_version, int *errno)
1919 {
1920         int f;
1921         int sz;
1922         uchar_t *ib;
1923         uchar_t *ibtail;
1924         uchar_t *ob;
1925         uchar_t *obtail;
1926         boolean_t do_not_ignore_null;
1927         boolean_t do_not_ignore_invalid;
1928         boolean_t is_it_toupper;
1929         boolean_t is_it_tolower;
1930         boolean_t canonical_decomposition;
1931         boolean_t compatibility_decomposition;
1932         boolean_t canonical_composition;
1933         size_t ret_val;
1934         size_t i;
1935         size_t j;
1936         uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
1937         u8_normalization_states_t state;
1938
1939         if (unicode_version > U8_UNICODE_LATEST) {
1940                 *errno = ERANGE;
1941                 return ((size_t)-1);
1942         }
1943
1944         f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
1945         if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1946                 *errno = EBADF;
1947                 return ((size_t)-1);
1948         }
1949
1950         f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1951         if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
1952             f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1953                 *errno = EBADF;
1954                 return ((size_t)-1);
1955         }
1956
1957         if (inarray == NULL || *inlen == 0)
1958                 return (0);
1959
1960         if (outarray == NULL) {
1961                 *errno = E2BIG;
1962                 return ((size_t)-1);
1963         }
1964
1965         ib = (uchar_t *)inarray;
1966         ob = (uchar_t *)outarray;
1967         ibtail = ib + *inlen;
1968         obtail = ob + *outlen;
1969
1970         do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
1971         do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
1972         is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1973         is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1974
1975         ret_val = 0;
1976
1977         /*
1978          * If we don't have a normalization flag set, we do the simple case
1979          * conversion based text preparation separately below. Text
1980          * preparation involving Normalization will be done in the false task
1981          * block, again, separately since it will take much more time and
1982          * resource than doing simple case conversions.
1983          */
1984         if (f == 0) {
1985                 while (ib < ibtail) {
1986                         if (*ib == '\0' && do_not_ignore_null)
1987                                 break;
1988
1989                         sz = u8_number_of_bytes[*ib];
1990
1991                         if (sz < 0) {
1992                                 if (do_not_ignore_invalid) {
1993                                         *errno = EILSEQ;
1994                                         ret_val = (size_t)-1;
1995                                         break;
1996                                 }
1997
1998                                 sz = 1;
1999                                 ret_val++;
2000                         }
2001
2002                         if (sz == 1) {
2003                                 if (ob >= obtail) {
2004                                         *errno = E2BIG;
2005                                         ret_val = (size_t)-1;
2006                                         break;
2007                                 }
2008
2009                                 if (is_it_toupper)
2010                                         *ob = U8_ASCII_TOUPPER(*ib);
2011                                 else if (is_it_tolower)
2012                                         *ob = U8_ASCII_TOLOWER(*ib);
2013                                 else
2014                                         *ob = *ib;
2015                                 ib++;
2016                                 ob++;
2017                         } else if ((ib + sz) > ibtail) {
2018                                 if (do_not_ignore_invalid) {
2019                                         *errno = EINVAL;
2020                                         ret_val = (size_t)-1;
2021                                         break;
2022                                 }
2023
2024                                 if ((obtail - ob) < (ibtail - ib)) {
2025                                         *errno = E2BIG;
2026                                         ret_val = (size_t)-1;
2027                                         break;
2028                                 }
2029
2030                                 /*
2031                                  * We treat the remaining incomplete character
2032                                  * bytes as a character.
2033                                  */
2034                                 ret_val++;
2035
2036                                 while (ib < ibtail)
2037                                         *ob++ = *ib++;
2038                         } else {
2039                                 if (is_it_toupper || is_it_tolower) {
2040                                         i = do_case_conv(unicode_version, u8s,
2041                                             ib, sz, is_it_toupper);
2042
2043                                         if ((obtail - ob) < i) {
2044                                                 *errno = E2BIG;
2045                                                 ret_val = (size_t)-1;
2046                                                 break;
2047                                         }
2048
2049                                         ib += sz;
2050
2051                                         for (sz = 0; sz < i; sz++)
2052                                                 *ob++ = u8s[sz];
2053                                 } else {
2054                                         if ((obtail - ob) < sz) {
2055                                                 *errno = E2BIG;
2056                                                 ret_val = (size_t)-1;
2057                                                 break;
2058                                         }
2059
2060                                         for (i = 0; i < sz; i++)
2061                                                 *ob++ = *ib++;
2062                                 }
2063                         }
2064                 }
2065         } else {
2066                 canonical_decomposition = flag & U8_CANON_DECOMP;
2067                 compatibility_decomposition = flag & U8_COMPAT_DECOMP;
2068                 canonical_composition = flag & U8_CANON_COMP;
2069
2070                 while (ib < ibtail) {
2071                         if (*ib == '\0' && do_not_ignore_null)
2072                                 break;
2073
2074                         /*
2075                          * If the current character is a 7-bit ASCII
2076                          * character and it is the last character, or,
2077                          * if the current character is a 7-bit ASCII
2078                          * character and the next character is also a 7-bit
2079                          * ASCII character, then, we copy over this
2080                          * character without going through collect_a_seq().
2081                          *
2082                          * In any other cases, we need to look further with
2083                          * the collect_a_seq() function.
2084                          */
2085                         if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
2086                             ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
2087                                 if (ob >= obtail) {
2088                                         *errno = E2BIG;
2089                                         ret_val = (size_t)-1;
2090                                         break;
2091                                 }
2092
2093                                 if (is_it_toupper)
2094                                         *ob = U8_ASCII_TOUPPER(*ib);
2095                                 else if (is_it_tolower)
2096                                         *ob = U8_ASCII_TOLOWER(*ib);
2097                                 else
2098                                         *ob = *ib;
2099                                 ib++;
2100                                 ob++;
2101                         } else {
2102                                 *errno = 0;
2103                                 state = U8_STATE_START;
2104
2105                                 j = collect_a_seq(unicode_version, u8s,
2106                                     &ib, ibtail,
2107                                     is_it_toupper,
2108                                     is_it_tolower,
2109                                     canonical_decomposition,
2110                                     compatibility_decomposition,
2111                                     canonical_composition,
2112                                     errno, &state);
2113
2114                                 if (*errno && do_not_ignore_invalid) {
2115                                         ret_val = (size_t)-1;
2116                                         break;
2117                                 }
2118
2119                                 if ((obtail - ob) < j) {
2120                                         *errno = E2BIG;
2121                                         ret_val = (size_t)-1;
2122                                         break;
2123                                 }
2124
2125                                 for (i = 0; i < j; i++)
2126                                         *ob++ = u8s[i];
2127                         }
2128                 }
2129         }
2130
2131         *inlen = ibtail - ib;
2132         *outlen = obtail - ob;
2133
2134         return (ret_val);
2135 }
2136
2137 #endif /* HAVE_UNICODE */