blob: 4c9f0ac6be71dc14080e727fc24f4e0d3ceaf5b3 [file] [log] [blame]
Alexander Pyhalov16d86562018-11-21 12:34:20 +03001/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 1995, by Sun Microsystems, Inc.
23 * All rights reserved.
24 */
25
26#include <stdio.h>
27#include <stdlib.h>
28#include <sys/types.h>
29#include <sys/isa_defs.h>
30#include <errno.h>
31#include "common_defs.h"
32#include "cns11643_unicode_TW.h" /* CNS 11643 to UTF8 mapping table */
33
#define	MSB		0x80	/* most significant bit */
#define	MBYTE		0x8e	/* multi-byte (4 byte character) */
#define	PMASK		0xa0	/* plane number mask */
#define	ONEBYTE		0xff	/* right most byte */
#define	MSB_OFF		0x7f	/* mask off MSB */
/*
 * True when v, reduced to an unsigned byte, lies in 0xA1 .. 0xFE.
 * The argument is parenthesized so that an expression argument is cast
 * as a whole rather than only its first operand.  Note that v is
 * evaluated twice; do not pass an expression with side effects.
 */
#define	VALID_EUC_BYTE(v)	\
	(((uchar_t)(v)) >= 0xA1 && ((uchar_t)(v)) <= 0xFE)
40
41/* non-identified character */
42#define UTF8_NON_ID_CHAR1 0xEF
43#define UTF8_NON_ID_CHAR2 0xBF
44#define UTF8_NON_ID_CHAR3 0xBD
45
46
47typedef struct _icv_state {
48 char keepc[4]; /* maximum # byte of CNS11643 code */
49 short cstate; /* state machine id */
50 int _errno; /* internal errno */
51 boolean little_endian;
52 boolean bom_written;
53} _iconv_st;
54
/*
 * State machine ids.  The numeric distance from C0 equals the number of
 * input bytes already stored in keepc[]; _icv_iconv() relies on that when
 * it rewinds *inbuf after an error.
 */
enum _CSTATE {
	C0,	/* no byte pending: expecting ASCII or a lead byte */
	C1,	/* lead byte stored: expecting the 2nd byte */
	C2,	/* 0x8E and plane byte stored: expecting the 3rd byte */
	C3	/* three bytes stored: expecting the 4th byte */
};
56
57static int get_plane_no_by_char(const char);
58static int cns_to_utf8(int, _iconv_st *, char*, size_t, int *);
59static int binsearch(unsigned long, cns_utf[], int);
60static uint_t getUnicodeFromUDA(int, uchar_t, uchar_t);
61
62
63/*
64 * Open; called from iconv_open()
65 */
66void *
67_icv_open()
68{
69 _iconv_st *st;
70
71 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
72 errno = ENOMEM;
73 return ((void *) -1);
74 }
75
76 st->cstate = C0;
77 st->_errno = 0;
78 st->little_endian = false;
79 st->bom_written = false;
80#if defined(UCS_2LE)
81 st->little_endian = true;
82 st->bom_written = true;
83#endif
84 return ((void *) st);
85}
86
87
88/*
89 * Close; called from iconv_close()
90 */
91void
92_icv_close(_iconv_st *st)
93{
94 if (!st)
95 errno = EBADF;
96 else
97 free(st);
98}
99
100
101/*
102 * Actual conversion; called from iconv()
103 */
104/*=======================================================
105 *
106 * State Machine for interpreting CNS 11643 code
107 *
108 *=======================================================
109 *
110 * plane 2 - 16
111 * 1st C 2nd C 3rd C
112 * +------> C0 -----> C1 -----------> C2 -----> C3
113 * | ascii | plane 1 | 4th C |
114 * ^ v 2nd C v v
115 * +----<---+-----<----+-------<---------<-------+
116 *
117 *=======================================================*/
118size_t
119_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
120 char **outbuf, size_t *outbytesleft)
121{
122 int plane_no = 0, n;
123 int uconv_num = 0;
124
125#ifdef DEBUG
126 fprintf(stderr, "========== iconv(): CNS11643 --> UTF2 ==========\n");
127#endif
128 if (st == NULL) {
129 errno = EBADF;
130 return ((size_t) -1);
131 }
132
133 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
134 st->cstate = C0;
135 st->_errno = 0;
136 return ((size_t) 0);
137 }
138
139 st->_errno = 0; /* reset internal errno */
140 errno = 0; /* reset external errno */
141
142 /* a state machine for interpreting CNS 11643 code */
143 while (*inbytesleft > 0 && *outbytesleft > 0) {
144 switch (st->cstate) {
145 case C0: /* assuming ASCII in the beginning */
146 if (**inbuf & MSB) {
147 if (((uchar_t)**inbuf) == MBYTE || VALID_EUC_BYTE(**inbuf)) {
148 st->keepc[0] = (**inbuf);
149 st->cstate = C1;
150 } else
151 st->_errno = errno = EILSEQ;
152 } else { /* real ASCII */
153 /*
154 * Code conversion for UCS-2LE to support Samba
155 */
156 if (st->little_endian) {
157 if (!st->bom_written) {
158 if (*outbytesleft < 4)
159 errno = E2BIG;
160 else {
161 *(*outbuf)++ = (uchar_t)0xff;
162 *(*outbuf)++ = (uchar_t)0xfe;
163 *outbytesleft -= 2;
164
165 st->bom_written = true;
166 }
167 }
168
169 if (*outbytesleft < 2)
170 errno = E2BIG;
171 else {
172 *(*outbuf)++ = **inbuf;
173 *(*outbuf)++ = (uchar_t)0x0;
174 *outbytesleft -= 2;
175 }
176 } else {
177 **outbuf = **inbuf;
178 (*outbuf)++;
179 (*outbytesleft)--;
180 }
181 }
182 break;
183 case C1: /* Chinese characters: 2nd byte */
184 if (((uchar_t)st->keepc[0]) == MBYTE) {
185 plane_no = get_plane_no_by_char(**inbuf);
186 if (plane_no == -1) { /* illegal plane */
187 st->_errno = errno = EILSEQ;
188 } else {
189 st->keepc[1] = (**inbuf);
190 st->cstate = C2;
191 }
192 } else {
193 if (VALID_EUC_BYTE(**inbuf)) { /* plane #1 */
194 int uconv_num_internal = 0;
195
196 st->keepc[1] = (**inbuf);
Toomas Soomea7fb1da2019-01-28 09:59:47 +0200197 st->keepc[2] = st->keepc[3] = '\0';
Alexander Pyhalov16d86562018-11-21 12:34:20 +0300198 n = cns_to_utf8(1, st, *outbuf,
199 *outbytesleft, &uconv_num_internal);
200 if (n > 0) {
201 (*outbuf) += n;
202 (*outbytesleft) -= n;
203
204 uconv_num += uconv_num_internal;
205
206 st->cstate = C0;
207 } else { /* don't reset state */
208 st->_errno = errno = E2BIG;
209 }
210 } else { /* input char doesn't belong
211 * to the input code set
212 */
213 st->_errno = errno = EILSEQ;
214 }
215 }
216 break;
217 case C2: /* plane #2 - #16 (4 bytes): get 3nd byte */
218 if (VALID_EUC_BYTE(**inbuf)) { /* 3rd byte */
219 st->keepc[2] = (**inbuf);
220 st->cstate = C3;
221 } else {
222 st->_errno = errno = EILSEQ;
223 }
224 break;
225 case C3: /* plane #2 - #16 (4 bytes): get 4th byte */
226 if (VALID_EUC_BYTE(**inbuf)) { /* 4th byte */
227 int uconv_num_internal = 0;
228
229 st->keepc[3] = (**inbuf);
230 n = cns_to_utf8(plane_no, st, *outbuf,
231 *outbytesleft, &uconv_num_internal);
232 if (n > 0) {
233 (*outbuf) += n;
234 (*outbytesleft) -= n;
235
236 uconv_num += uconv_num_internal;
237
238 st->cstate = C0; /* reset state */
239 } else { /* don't reset state */
240 st->_errno = errno = E2BIG;
241 }
242 } else {
243 st->_errno = errno = EILSEQ;
244 }
245 break;
246 default: /* should never come here */
247 st->_errno = errno = EILSEQ;
248 st->cstate = C0; /* reset state */
249 break;
250 }
251
252 if (st->_errno) {
253#ifdef DEBUG
254 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n",
255 st->_errno, st->cstate);
256#endif
257 break;
258 }
259
260 (*inbuf)++;
261 (*inbytesleft)--;
262 }
263
264 if (*inbytesleft == 0 && st->cstate != C0)
265 errno = EINVAL;
266
267 if (*inbytesleft > 0 && *outbytesleft == 0)
268 errno = E2BIG;
269
270 if (errno) {
271 /*
272 * if error, *inbuf points to the byte following the last byte
273 * successfully used in the conversion.
274 */
275 *inbuf -= (st->cstate - C0);
276 *inbytesleft += (st->cstate - C0);
277 st->cstate = C0;
278 return ((size_t) -1);
279 }
280
281 return uconv_num;
282}
283
284
285/*
286 * Get plane number by char; i.e. 0xa2 returns 2, 0xae returns 14, etc.
287 * Returns -1 on error conditions
288 */
289static int get_plane_no_by_char(const char inbuf)
290{
291 int ret;
292 unsigned char uc = (unsigned char) inbuf;
293
294 ret = uc - PMASK;
295 switch (ret) {
296 case 1: /* 0x8EA1 */
297 case 2: /* 0x8EA2 */
298 case 3: /* 0x8EA3 */
299 case 4: /* 0x8EA4 */
300 case 5: /* 0x8EA5 */
301 case 6: /* 0x8EA6 */
302 case 7: /* 0x8EA7 */
303 case 12: /* 0x8EAC */
304 case 13: /* 0x8EAD */
305 case 14: /* 0x8EAE */
306 case 15: /* 0x8EAF */
307 case 16: /* 0x8EB0 */
308 return (ret);
309 default:
310 return (-1);
311 }
312}
313
314
315/*
316 * CNS 11643 code --> ISO/IEC 10646 (Unicode)
317 * Unicode --> UTF8 (FSS-UTF)
318 * (File System Safe Universal Character Set Transformation Format)
319 * Return: > 0 - converted with enough space in output buffer
320 * = 0 - no space in outbuf
321 */
322static int cns_to_utf8(int plane_no, _iconv_st *st, char *buf, size_t buflen, int *uconv_num)
323{
324 char cns_str[3];
325 unsigned long cns_val; /* MSB mask off CNS 11643 value */
326 int unidx; /* Unicode index */
327 unsigned long uni_val = 0; /* Unicode */
328 char *keepc = st->keepc;
329
330#ifdef DEBUG
331 fprintf(stderr, "%s %d ", keepc, plane_no);
332#endif
333 if (plane_no == 1) {
334 cns_str[0] = keepc[0] & MSB_OFF;
335 cns_str[1] = keepc[1] & MSB_OFF;
336 } else {
337 cns_str[0] = keepc[2] & MSB_OFF;
338 cns_str[1] = keepc[3] & MSB_OFF;
339 }
340 cns_val = (cns_str[0] << 8) + cns_str[1];
341#ifdef DEBUG
342 fprintf(stderr, "%x\t", cns_val);
343#endif
344
345 switch (plane_no) {
346 case 1:
347 unidx = binsearch(cns_val, cns1_utf_tab, MAX_CNS1_NUM);
348 if (unidx >= 0)
349 uni_val = cns1_utf_tab[unidx].unicode;
350 break;
351 case 2:
352 unidx = binsearch(cns_val, cns2_utf_tab, MAX_CNS2_NUM);
353 if (unidx >= 0)
354 uni_val = cns2_utf_tab[unidx].unicode;
355 break;
356 case 3:
357 unidx = binsearch(cns_val, cns3_utf_tab, MAX_CNS3_NUM);
358 if (unidx >= 0)
359 uni_val = cns3_utf_tab[unidx].unicode;
360 break;
361 case 4:
362 unidx = binsearch(cns_val, cns4_utf_tab, MAX_CNS4_NUM);
363 if (unidx >= 0)
364 uni_val = cns4_utf_tab[unidx].unicode;
365 break;
366 case 5:
367 unidx = binsearch(cns_val, cns5_utf_tab, MAX_CNS5_NUM);
368 if (unidx >= 0)
369 uni_val = cns5_utf_tab[unidx].unicode;
370 break;
371 case 6:
372 unidx = binsearch(cns_val, cns6_utf_tab, MAX_CNS6_NUM);
373 if (unidx >= 0)
374 uni_val = cns6_utf_tab[unidx].unicode;
375 break;
376 case 7:
377 unidx = binsearch(cns_val, cns7_utf_tab, MAX_CNS7_NUM);
378 if (unidx >= 0)
379 uni_val = cns7_utf_tab[unidx].unicode;
380 break;
381 case 12:
382 case 13:
383 case 14:
384 case 16:
385 uni_val = getUnicodeFromUDA(plane_no, (uchar_t)keepc[2], (uchar_t)keepc[3]);
386 unidx = 1; /* deceit the following if statement */
387 break;
388 case 15:
389 unidx = binsearch(cns_val, cns15_utf_tab, MAX_CNS15_NUM);
390 if (unidx >= 0)
391 uni_val = cns15_utf_tab[unidx].unicode;
392 break;
393 default:
394 unidx = -1; /* no mapping from CNS to UTF8 */
395 break;
396 }
397
398#ifdef DEBUG
399 fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val);
400#endif
401
402 /*
403 * Code version for UCS-2LE to support Samba
404 */
405 if (st->little_endian) {
406 int size = 0;
407
408 if (unidx < 0 || uni_val > 0x00ffff ) {
409 uni_val = ICV_CHAR_UCS2_REPLACEMENT;
410 *uconv_num = 1;
411 }
412
413 if (!st->bom_written) {
414 if (buflen < 4)
415 return 0;
416
417 *(buf + size++) = (uchar_t)0xff;
418 *(buf + size++) = (uchar_t)0xfe;
419 st->bom_written = true;
420 }
421
422 if (buflen < 2)
423 return 0;
424
425 *(buf + size++) = (uchar_t)(uni_val & 0xff);
426 *(buf + size++) = (uchar_t)((uni_val >> 8) & 0xff);
427
428 return size;
429 }
430
431 if (unidx >= 0) { /* do Unicode to UTF8 conversion */
432 if (uni_val >= 0x0080 && uni_val <= 0x07ff) {
433 if (buflen < 2) {
434#ifdef DEBUG
435 fprintf(stderr, "outbuf overflow in cns_to_utf8()!!\n");
436#endif
437 errno = E2BIG;
438 return(0);
439 }
440 *buf = (char)((uni_val >> 6) & 0x1f) | 0xc0;
441 *(buf+1) = (char)(uni_val & 0x3f) | 0x80;
442#ifdef DEBUG
443 fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE);
444#endif
445 return(2);
446 }
447 if (uni_val >= 0x0800 && uni_val <= 0xffff) {
448 if (buflen < 3) {
449#ifdef DEBUG
450 fprintf(stderr, "outbuf overflow in cns_to_utf8()!!\n");
451#endif
452 errno = E2BIG;
453 return(0);
454 }
455 *buf = (char)((uni_val >> 12) & 0xf) | 0xe0;
456 *(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80;
457 *(buf+2) = (char)(uni_val & 0x3f) | 0x80;
458#ifdef DEBUG
459 fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE);
460#endif
461 return(3);
462 }
463 if (uni_val >= 0x10000 && uni_val <= 0x10ffff) {
464 if (buflen < 4) {
465 errno = E2BIG;
466 return(0);
467 }
468
469 *buf = (char)((uni_val >> 18) & 0x7) | 0xf0;
470 *(buf+1) = (char)((uni_val >> 12) & 0x3f) | 0x80;
471 *(buf+2) = (char)((uni_val >>6) & 0x3f) | 0x80;
472 *(buf+3) = (char)(uni_val & 0x3f) | 0x80;
473 return(4);
474 }
475 }
476
477 /* can't find a match in CNS --> UTF8 table or illegal UTF8 code */
478 if (buflen < 3) {
479#ifdef DEBUG
480 fprintf(stderr, "outbuf overflow in cns_to_utf8()!!\n");
481#endif
482 errno = E2BIG;
483 return(0);
484 }
485
486 *(unsigned char*) buf = UTF8_NON_ID_CHAR1;
487 *(unsigned char*) (buf+1) = UTF8_NON_ID_CHAR2;
488 *(unsigned char*) (buf+2) = UTF8_NON_ID_CHAR3;
489
490 /* non-identical conversion */
491 *uconv_num = 1;
492
493#ifdef DEBUG
494 fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2));
495#endif
496 return(3);
497}
498
499static uint_t
500getUnicodeFromUDA(int plane_no, uchar_t byte1, uchar_t byte2)
501{
502 uint_t ucs4, disp;
503
504 /* compact into consecutive Unicode value for CNS plane 16 */
505 if ( plane_no == 16 ) --plane_no;
506
507 disp = (plane_no - 12) * 8836 + (byte1 - 0xA1) * 94 + ( byte2 - 0xA1);
508 return (ucs4 = (0xf << 16) | (disp & 0xffff));
509}
510
511/* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
512static int binsearch(unsigned long x, cns_utf v[], int n)
513{
514 int low, high, mid;
515
516 low = 0;
517 high = n - 1;
518 while (low <= high) {
519 mid = (low + high) / 2;
520 if (x < v[mid].cnscode)
521 high = mid - 1;
522 else if (x > v[mid].cnscode)
523 low = mid + 1;
524 else /* found match */
525 return mid;
526 }
527 return (-1); /* no match */
528}