blob: bac7933fcabbd21662bc8801b692fc382240842f [file] [log] [blame]
Alexander Pyhalov16d86562018-11-21 12:34:20 +03001/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1995, by Sun Microsystems, Inc.
24 * All rights reserved.
25 */
26
27#include <stdio.h>
28#include <stdlib.h>
29#include <sys/types.h>
30#include <sys/isa_defs.h>
31#include <errno.h>
32#include "unicode_cns11643_TW.h"
33#include "common_defs.h"
34
35#define MSB 0x80 /* most significant bit */
36#define MBYTE 0x8e /* multi-byte (4 byte character) */
37#define PMASK 0xa0 /* plane number mask */
38#define ONEBYTE 0xff /* right most byte */
39
40#define NON_ID_CHAR '?' /* non-identified character */
41
42#define Low_UDA_In_Unicode 0xF0000
43#define High_UDA_In_Unicode 0xF8A10
44
/*
 * Per-descriptor conversion state.  One instance is allocated by
 * _icv_open(), carried through every _icv_iconv() call and released by
 * _icv_close().
 */
typedef struct _icv_state {
	char keepc[6];		/* bytes of the character being assembled; */
				/* sized for the maximum # byte of UTF8 code */
	short ustate;		/* current decoder state (enum _USTATE) */
	int _errno;		/* internal errno */
	boolean little_endian;	/* true: input is read as 2-byte */
				/* little-endian units instead of UTF-8 */
	boolean bom_written;	/* set in _icv_open(); not read anywhere */
				/* in the code visible here */
} _iconv_st;
52
/*
 * Decoder states used by _icv_iconv():
 *   U0      - idle, waiting for the first byte of a character
 *   U1      - waiting for the 2nd (last) byte of a 2-byte character
 *   U2, U3  - waiting for the 2nd / 3rd byte of a 3-byte character
 *   U5..U7  - waiting for the 2nd / 3rd / 4th byte of a 4-byte character
 *   U4      - a complete code point is available and must be emitted
 */
enum _USTATE { U0, U1, U2, U3, U4, U5, U6, U7 };
54
55static int get_plane_no_by_utf(uint_t, int *, unsigned long *);
56static int utf8_to_cns(int, int, unsigned long, char *, size_t, int *);
57static int binsearch(unsigned long, utf_cns[], int);
58
59/*
60 * Open; called from iconv_open()
61 */
62void *
63_icv_open()
64{
65 _iconv_st *st;
66
67 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
68 errno = ENOMEM;
69 return ((void *) -1);
70 }
71
72 st->ustate = U0;
73 st->_errno = 0;
74 st->little_endian = false;
75 st->bom_written = false;
76#if defined(UCS_2LE)
77 st->little_endian = true;
78 st->bom_written = true;
79#endif
80 return ((void *) st);
81}
82
83
84/*
85 * Close; called from iconv_close()
86 */
87void
88_icv_close(_iconv_st *st)
89{
90 if (!st)
91 errno = EBADF;
92 else
93 free(st);
94}
95
96
/*
 * Actual conversion; called from iconv()
 *
 * Converts the input at *inbuf (UTF-8, or 2-byte little-endian units when
 * st->little_endian is set) to CNS 11643 bytes at *outbuf, advancing both
 * pointers and decrementing both byte counts as it goes.
 *
 * State machine for interpreting the input (one input byte per step):
 *
 *   U0  idle.  An ASCII byte is copied straight to the output.  A 2-byte
 *       lead byte goes to U1, a 3-byte lead byte to U2, anything else that
 *       the number_of_bytes_in_utf8_char table does not reject goes to U5.
 *       In little-endian mode every first byte goes to U1.
 *   U1  last byte of a 2-byte character                     -> U4
 *   U2  2nd byte of a 3-byte character                      -> U3
 *   U3  3rd byte of a 3-byte character                      -> U4
 *   U5  2nd byte of a 4-byte character                      -> U6
 *   U6  3rd byte of a 4-byte character                      -> U7
 *   U7  4th byte of a 4-byte character                      -> U4
 *   U4  look the code point up and write the CNS bytes      -> U0
 *
 * U1, U3 and U7 enter U4 with "continue", i.e. without consuming the
 * final input byte; that byte is consumed only after U4 succeeds.
 *
 * Returns:
 *   (size_t)-1 with errno = EBADF   if st is NULL;
 *   0                               for a reset request (inbuf or *inbuf
 *                                   is NULL); the state returns to U0;
 *   (size_t)-1 with errno = EILSEQ  on an invalid input sequence,
 *              errno = E2BIG        when the output buffer is too small,
 *              errno = EINVAL       when the input ends in mid-character;
 *   otherwise the number of non-identical conversions performed (the sum
 *   of the counts reported by utf8_to_cns()).
 *
 * On every error return *inbuf / *inbytesleft are moved back to the first
 * byte of the character that was being processed and the state is reset
 * to U0.
 */
size_t
_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
    char **outbuf, size_t *outbytesleft)
{
	int plane_no, n, unidx;
	unsigned long cnscode;
	uint_t ucs;		/* code point handed from U1/U3/U7 to U4 */
	int uconv_num = 0;	/* running count of non-identical conversions */
	int utf8_len = 0;	/* byte length of the character reaching U4 */

#ifdef DEBUG
	fprintf(stderr, "========== iconv(): UTF2 --> CNS11643 ==========\n");
#endif
	if (st == NULL) {
		errno = EBADF;
		return ((size_t) -1);
	}

	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
		st->ustate = U0;
		st->_errno = 0;
		return ((size_t) 0);
	}

	st->_errno = 0;	/* reset internal errno */
	errno = 0;	/* reset external errno */

	/* a state machine for interpreting UTF8 code */
	while (*inbytesleft > 0 && *outbytesleft > 0) {

		uchar_t first_byte;
		int uconv_num_internal = 0;

		switch (st->ustate) {
		case U0:		/* idle: first byte of a character */
			/*
			 * Code conversion for UCS-2LE to support Samba:
			 * every unit is two bytes, so always go to U1.
			 */
			if (st->little_endian) {
				st->ustate = U1;
				st->keepc[0] = **inbuf;
			}
			else if ((**inbuf & MSB) == 0) {	/* ASCII */
				**outbuf = **inbuf;
				(*outbuf)++;
				(*outbytesleft)--;
			} else {	/* lead byte of a multi-byte sequence */
				if ((**inbuf & 0xe0) == 0xc0) {	/* 2 byte unicode */

					/* invalid sequence if the first char is either 0xc0 or 0xc1 */
					if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
						st->_errno = errno = EILSEQ;
					else {
						st->ustate = U1;
						st->keepc[0] = **inbuf;
					}
				} else if ((**inbuf & 0xf0) == 0xe0) {	/* 3 byte 0xe0..0xef */
					st->ustate = U2;
					st->keepc[0] = **inbuf;
				} else {
					/*
					 * four bytes of UTF-8 sequences; every
					 * other byte value also lands here and is
					 * accepted or rejected by the table lookup
					 */
					if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
						st->_errno = errno = EILSEQ;
					else {
						st->ustate = U5;
						st->keepc[0] = **inbuf;
					}
				}
			}
			break;
		case U1:		/* 2 byte unicode */
			/* little-endian mode accepts any second byte */
			if ((**inbuf & 0xc0) == 0x80 || st->little_endian) {
				utf8_len = 2;
				st->keepc[1] = **inbuf;

				/*
				 * Code conversion for UCS-2LE to support Samba
				 */
				if (st->little_endian) {
					/*
					 * It's ASCII: high byte zero and low
					 * byte below 0x80.  Copy it and let the
					 * common code consume this byte.
					 */
					if (st->keepc[1] == 0 && (st->keepc[0] & 0x80) == 0) {
						*(*outbuf)++ = st->keepc[0];
						(*outbytesleft)--;
						st->ustate = U0;
						break;
					}

					/* assemble the 16-bit value, low byte first */
					ucs = ((st->keepc[1] & 0xff)<< 8) | (st->keepc[0] & 0xff);

				} else
					convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);

				st->ustate = U4;
#ifdef DEBUG
				fprintf(stderr, "UTF8: %02x%02x --> ",
				    st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
#endif
				continue;	/* should not advance *inbuf */
			} else {
				st->_errno = errno = EILSEQ;
			}
			break;
		case U2:		/* 3 byte unicode - 2nd byte */

			first_byte = st->keepc[0];

			/*
			 * The legal range of the second byte depends on the
			 * first byte and is taken from the valid_min_2nd_byte /
			 * valid_max_2nd_byte tables.  Original note: if the
			 * first byte is 0xed, it is illegal sequence if the
			 * second one is between 0xa0 and 0xbf because surrogate
			 * section is ill-formed.
			 */
			if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
				st->_errno = errno = EILSEQ;
			else {
				st->ustate = U3;
				st->keepc[1] = **inbuf;
			}
			break;
		case U3:		/* 3 byte unicode - 3rd byte */
			if ((**inbuf & 0xc0) == 0x80) {
				st->ustate = U4;
				utf8_len = 3;
				st->keepc[2] = **inbuf;

				convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
#ifdef DEBUG
				fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
				    st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
#endif
				continue;	/* should not advance *inbuf */
			} else {
				st->_errno = errno = EILSEQ;
			}
			break;
		case U4:		/* complete code point in ucs: emit it */

			/* 0xfffe and 0xffff should not be allowed */
			if ( ucs == 0xFFFE || ucs == 0xFFFF ) {
				st->_errno = errno = EILSEQ;
				break;
			}

			plane_no = get_plane_no_by_utf(ucs, &unidx, &cnscode);

			/*
			 * A check that rejected plane_no < 0 with EILSEQ used
			 * to sit here; it was disabled on purpose "to ignore
			 * the invalid CNS", so plane_no is passed on unchecked.
			 */

			n = utf8_to_cns(plane_no, unidx, cnscode,
			    *outbuf, *outbytesleft, &uconv_num_internal);
			if (n > 0) {
				(*outbuf) += n;
				(*outbytesleft) -= n;

				uconv_num += uconv_num_internal;

				st->ustate = U0;
			} else {
				/* not enough room left for this character */
				st->_errno = errno = E2BIG;
			}
			break;
		case U5:		/* 4 byte unicode - 2nd byte */

			first_byte = st->keepc[0];

			/*
			 * Second byte is range-checked against the same
			 * per-lead-byte tables as in U2.  Original note: if
			 * the first byte is 0xf0, it is illegal sequence if
			 * the second one is between 0x80 and 0x8f
			 * for Four-Byte UTF: U+10000..U+10FFFF
			 */
			if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
				st->_errno = errno = EILSEQ;
			else {
				st->ustate = U6;
				st->keepc[1] = **inbuf;
			}
			break;
		case U6:		/* 4 byte unicode - 3rd byte */
			if ((**inbuf & 0xc0) == 0x80)	/* 0x80..0xbf */
			{
				st->ustate = U7;
				st->keepc[2] = **inbuf;
			}
			else
				st->_errno = errno = EILSEQ;
			break;
		case U7:		/* 4 byte unicode - 4th byte */
			if ((**inbuf & 0xc0) == 0x80)	/* 0x80..0xbf */
			{
				st->keepc[3] = **inbuf;
				utf8_len = 4;

				convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
				st->ustate = U4;

				continue;	/* should not advance *inbuf */
			}
			else
				st->_errno = errno = EILSEQ;
			break;
		default:		/* should never come here */
			st->_errno = errno = EILSEQ;
			st->ustate = U0;	/* reset state */
			break;
		}

		/* stop at the first error; the current byte stays unconsumed */
		if (st->_errno) {
#ifdef DEBUG
			fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
			    st->_errno, st->ustate);
#endif
			break;
		}

		/* the current input byte has been fully handled */
		(*inbuf)++;
		(*inbytesleft)--;
	}

	/* input ran out in the middle of a character */
	if (*inbytesleft == 0 && st->ustate != U0)
		errno = EINVAL;


	/* input remains but the output buffer is full */
	if (*inbytesleft > 0 && *outbytesleft == 0)
		errno = E2BIG;

	if (errno) {
		/*
		 * Number of already-consumed bytes that belong to the
		 * unfinished character, derived from the state we stopped in.
		 */
		int num_reversed_bytes = 0;

		switch (st->ustate) {
		case U1:
			num_reversed_bytes = 1;
			break;
		case U2:
			num_reversed_bytes = 1;
			break;
		case U3:
			num_reversed_bytes = 2;
			break;
		case U4:
			/* the last byte was never consumed (see "continue") */
			num_reversed_bytes = utf8_len - 1;
			break;
		case U5:
			num_reversed_bytes = 1;
			break;
		case U6:
			num_reversed_bytes = 2;
			break;
		case U7:
			num_reversed_bytes = 3;
			break;
		}

		/*
		 * if error, *inbuf points to the byte following the last byte
		 * successfully used in the conversion.
		 */
		*inbuf -= num_reversed_bytes;
		*inbytesleft += num_reversed_bytes;
		st->ustate = U0;
		return ((size_t) -1);
	}

	return uconv_num;
}
389
390/*
391 * Get plane number by UTF8 code; i.e. plane #1 returns 1, #2 returns 2, etc.
392 * Returns -1 on error conditions
393 *
394 * Since binary search of the UTF8 to CNS table is necessary, might as well
395 * return index and CNS code matching to the unicode.
396 */
397static int get_plane_no_by_utf(uint_t unicode,
398 int *unidx, unsigned long *cnscode)
399{
Toomas Soomea7fb1da2019-01-28 09:59:47 +0200400 int ret;
Alexander Pyhalov16d86562018-11-21 12:34:20 +0300401
402 /* test whether it belongs to private Unicode plane 15 */
403 if (unicode >= Low_UDA_In_Unicode && unicode <= High_UDA_In_Unicode)
404 {
405 uint_t internIdx = (uint_t)(unicode - Low_UDA_In_Unicode);
406 uchar_t byte1, byte2;
407
408 byte1 = 0xa1 + (internIdx % 8836) / 94;
409 byte2 = 0xa1 + internIdx % 94;
410 *cnscode = ((byte1 << 8) & 0xff00) | (byte2 & 0xff);
411
412 *unidx = 1; /* deceit the utf8_to_cns() */
413
414 ret = 12 + internIdx / 8836;
415 /* actually it belongs to CNS plane 16, so change it */
416 if ( ret == 15 ) ++ret;
417
418 return ret;
419 }
420
421
422 *unidx = binsearch(unicode, utf_cns_tab, MAX_UTF_NUM);
423 if ((*unidx) >= 0)
424 *cnscode = utf_cns_tab[*unidx].cnscode;
425 else
426 return(0); /* match from UTF8 to CNS not found */
427#ifdef DEBUG
428 fprintf(stderr, "Unicode=%04x, idx=%5d, CNS=%x ", unicode, *unidx, *cnscode);
429#endif
430
431 ret = (int) (*cnscode >> 16);
432 switch (ret) {
433 case 0x21: /* 0x8EA1 - G */
434 case 0x22: /* 0x8EA2 - H */
435 case 0x23: /* 0x8EA3 - I */
436 case 0x24: /* 0x8EA4 - J */
437 case 0x25: /* 0x8EA5 - K */
438 case 0x26: /* 0x8EA6 - L */
439 case 0x27: /* 0x8EA7 - M */
440 case 0x28: /* 0x8EA8 - N */
441 case 0x29: /* 0x8EA9 - O */
442 case 0x2a: /* 0x8EAA - P */
443 case 0x2b: /* 0x8EAB - Q */
444 case 0x2c: /* 0x8EAC - R */
445 case 0x2d: /* 0x8EAD - S */
446 case 0x2e: /* 0x8EAE - T */
447 case 0x2f: /* 0x8EAF - U */
448 case 0x30: /* 0x8EB0 - V */
449 return (ret - 0x20); /* so that we can use GET_PLANEC() */
450 default:
451 return (-1);
452 }
453}
454
455
456/*
457 * ISO/IEC 10646 (Unicode) --> ISO 2022-7
458 * Unicode --> UTF8 (FSS-UTF)
459 * (File System Safe Universal Character Set Transformation Format)
460 * Return: > 0 - converted with enough space in output buffer
461 * = 0 - no space in outbuf
462 */
463static int utf8_to_cns(int plane_no, int unidx, unsigned long cnscode,
464 char *buf, size_t buflen, int *uconv_num)
465{
466 unsigned long val; /* CNS 11643 value */
467 unsigned char c1 = 0, c2 = 0, cns_str[5];
468 int ret_size;
469
470 if (unidx < 0) { /* no match from UTF8 to CNS 11643 */
471 if ( buflen < 2 ) goto err;
472 *buf = *(buf+1) = NON_ID_CHAR;
473
474 /* non-identical conversion */
475 *uconv_num = 1;
476
477 ret_size = 2;
478 } else {
479 val = cnscode & 0xffff;
480 c1 = ((val & 0xff00) >> 8) | MSB;
481 c2 = (val & 0xff) | MSB;
482 }
483
484 switch (plane_no) {
485 case 1:
486 if ( buflen < 2) goto err;
487 *buf = cns_str[0] = c1;
488 *(buf+1) = cns_str[1] = c2;
Toomas Soomea7fb1da2019-01-28 09:59:47 +0200489 cns_str[2] = cns_str[3] = cns_str[4] = '\0';
Alexander Pyhalov16d86562018-11-21 12:34:20 +0300490 ret_size = 2;
491 break;
492 case 2:
493 case 3:
494 case 4:
495 case 5:
496 case 6:
497 case 7:
498 case 8:
499 case 9:
500 case 10:
501 case 11:
502 case 12:
503 case 13:
504 case 14:
505 case 15:
506 case 16:
507 if ( buflen < 4) goto err;
508 *(unsigned char*) buf = cns_str[0] = MBYTE;
509 *(buf+1) = cns_str[1] = PMASK + plane_no;
510 *(buf+2) = cns_str[2] = c1;
511 *(buf+3) = cns_str[3] = c2;
Toomas Soomea7fb1da2019-01-28 09:59:47 +0200512 cns_str[4] = '\0';
Alexander Pyhalov16d86562018-11-21 12:34:20 +0300513 ret_size = 4;
514 break;
515 }
516
517#ifdef DEBUG
518 fprintf(stderr, "\t#%d ->%s<-\n", plane_no, cns_str);
519#endif
520
521 return(ret_size);
522
523err:
524 errno = E2BIG;
525 return 0;
526}
527
528
529/* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
530static int binsearch(unsigned long x, utf_cns v[], int n)
531{
532 int low, high, mid;
533
534 low = 0;
535 high = n - 1;
536 while (low <= high) {
537 mid = (low + high) / 2;
538 if (x < v[mid].unicode)
539 high = mid - 1;
540 else if (x > v[mid].unicode)
541 low = mid + 1;
542 else /* found match */
543 return mid;
544 }
545 return (-1); /* no match */
546}