/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1995, by Sun Microsystems, Inc.
 * All rights reserved.
 */
26
27#include <stdio.h>
28#include <stdlib.h>
29#include <sys/types.h>
30#include <sys/isa_defs.h>
31#include <errno.h>
32#include "unicode_big5.h" /* UTF8 to Big-5 mapping table */
33#include "common_defs.h"
34
#define	MSB		0x80	/* top bit of a byte; clear on ASCII input */
#define	ONEBYTE		0xff	/* mask that keeps only the low-order byte */

#define	NON_ID_CHAR	'?'	/* stand-in for a character with no mapping */
39
40typedef struct _icv_state {
41 char keepc[6]; /* maximum # byte of UTF8 code */
42 short ustate;
43 int _errno; /* internal errno */
44 boolean little_endian;
45 boolean bom_written;
46} _iconv_st;
47
/* Decoder states used by the conversion state machine. */
enum _USTATE {
	U0,	/* between characters */
	U1,	/* first byte of a 2-byte character (or of a UCS-2LE unit) held */
	U2,	/* first byte of a 3-byte character held */
	U3,	/* first two bytes of a 3-byte character held */
	U4,	/* character fully decoded; Big-5 output pending */
	U5,	/* first byte of a 4-byte character held */
	U6,	/* first two bytes of a 4-byte character held */
	U7	/* first three bytes of a 4-byte character held */
};
49
50static int get_big5_by_utf(uint_t, int *, unsigned long *);
51static int utf8_to_big5(int, unsigned long, char *, size_t, int *);
52static int binsearch(unsigned long, utf_big5[], int);
53
54
55/*
56 * Open; called from iconv_open()
57 */
58void *
59_icv_open()
60{
61 _iconv_st *st;
62
63 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
64 errno = ENOMEM;
65 return ((void *) -1);
66 }
67
68 st->ustate = U0;
69 st->_errno = 0;
70 st->little_endian = false;
71 st->bom_written = false;
72#if defined(UCS_2LE)
73 st->little_endian = true;
74 st->bom_written = true;
75#endif
76 return ((void *) st);
77}
78
79
80/*
81 * Close; called from iconv_close()
82 */
83void
84_icv_close(_iconv_st *st)
85{
86 if (!st)
87 errno = EBADF;
88 else
89 free(st);
90}
91
92
93/*
94 * Actual conversion; called from iconv()
95 */
96/*=========================================================
97 *
98 * State Machine for interpreting UTF8 code
99 *
100 *=========================================================
101 * 2nd byte 3rd byte 4th byte
102 * +----->------->------>U5----->U6------------>U7
103 * | |
104 * | 3 byte unicode |
105 * +----->------->-------+ |
106 * | | |
107 * ^ v |
108 * | 2 byte U2 ---> U3 |
109 * | unicode v |
110 * +------> U0 -------> U1 +-------->U4---+
111 * ^ ascii | | ^ |
112 * | | +-------->--------->--------+ |
113 * | v v
114 * +----<---+-----<------------<------------<------------+
115 *
116 *=========================================================*/
117size_t
118_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
119 char **outbuf, size_t *outbytesleft)
120{
121 int n, unidx;
122 unsigned long big5code;
123 int uconv_num = 0;
124 int utf8_len = 0;
125 uint_t ucs;
126
127#ifdef DEBUG
128 fprintf(stderr, "========== iconv(): UTF2 --> Big-5 ==========\n");
129#endif
130 if (st == NULL) {
131 errno = EBADF;
132 return ((size_t) -1);
133 }
134
135 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
136 st->ustate = U0;
137 st->_errno = 0;
138 return ((size_t) 0);
139 }
140
141 st->_errno = 0; /* reset internal errno */
142 errno = 0; /* reset external errno */
143
144 /* a state machine for interpreting UTF8 code */
145 while (*inbytesleft > 0 && *outbytesleft > 0) {
146
147 uchar_t first_byte;
148 int uconv_num_internal = 0;
149
150 switch (st->ustate) {
151 case U0: /* assuming ASCII in the beginning */
152 /*
153 * Code converion for UCS-2LE to support Samba
154 */
155 if (st->little_endian) {
156 st->ustate = U1;
157 st->keepc[0] = **inbuf;
158 }
159 else if ((**inbuf & MSB) == 0) { /* ASCII */
160 **outbuf = **inbuf;
161 (*outbuf)++;
162 (*outbytesleft)--;
163 } else { /* Chinese character */
164 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode 0xc2..0xdf */
165
166 /* invalid sequence if the first char is either 0xc0 or 0xc1 */
167 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
168 st->_errno = errno = EILSEQ;
169 else {
170 st->ustate = U1;
171 st->keepc[0] = **inbuf;
172 }
173 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte 0xe0..0xef */
174 st->ustate = U2;
175 st->keepc[0] = **inbuf;
176 } else {
177 /* four bytes of UTF-8 sequences */
178 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
179 st->_errno = errno = EILSEQ;
180 else {
181 st->ustate = U5;
182 st->keepc[0] = **inbuf;
183 }
184 }
185 }
186 break;
187 case U1: /* 2 byte unicode */
188 if ((**inbuf & 0xc0) == MSB || st->little_endian) {
189 utf8_len = 2;
190 st->keepc[1] = **inbuf;
191
192 /*
193 * Code conversion for UCS-2LE to support Samba
194 */
195 if (st->little_endian) {
196 /*
197 * It's ASCII
198 */
199 if (st->keepc[1] == 0 && (st->keepc[0] & 0x80) == 0) {
200 *(*outbuf)++ = st->keepc[0];
201 (*outbytesleft)--;
202 st->ustate = U0;
203 break;
204 }
205
206 ucs = ((st->keepc[1] & 0xff) << 8) | (st->keepc[0] & 0xff);
207
208 } else
209 convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
210
211 st->ustate = U4;
212#ifdef DEBUG
213 fprintf(stderr, "UTF8: %02x%02x --> ",
214 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
215#endif
216 continue; /* should not advance *inbuf */
217 } else {
218 st->_errno = errno = EILSEQ;
219 }
220 break;
221 case U2: /* 3 byte unicode - 2nd byte */
222
223 first_byte = st->keepc[0];
224
225 /* if the first byte is 0xed, it is illegal sequence if the second
226 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
227 */
228 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
229 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
230 st->_errno = errno = EILSEQ;
231 else {
232 st->ustate = U3;
233 st->keepc[1] = **inbuf;
234 }
235 break;
236 case U3: /* 3 byte unicode - 3rd byte */
237 if ((**inbuf & 0xc0) == MSB) {
238 st->ustate = U4;
239 utf8_len = 3;
240 st->keepc[2] = **inbuf;
241
242 convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
243#ifdef DEBUG
244 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
245 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
246#endif
247 continue; /* should not advance *inbuf */
248 } else {
249 st->_errno = errno = EILSEQ;
250 }
251 break;
252 case U4:
253
254 n = get_big5_by_utf(ucs, &unidx, &big5code);
255
256 if ( n == -1 )
257 { /* unicode is either 0xfffe or 0xffff */
258 st->_errno = errno = EILSEQ;
259 break;
260 }
261
262/* comment the following lines out to ignore the non-Big5 characters
263 if (n != 0) { * legal unicode;illegal Big5 *
264 st->_errno = errno = EILSEQ;
265 break;
266 }
267*/
268
269 n = utf8_to_big5(unidx, big5code,
270 *outbuf, *outbytesleft, &uconv_num_internal);
271 if (n > 0) {
272 (*outbuf) += n;
273 (*outbytesleft) -= n;
274
275 uconv_num += uconv_num_internal;
276
277 st->ustate = U0;
278 } else {
279 st->_errno = errno = E2BIG;
280 }
281 break;
282 case U5:
283
284 first_byte = st->keepc[0];
285
286 /* if the first byte is 0xf0, it is illegal sequence if
287 * the second one is between 0x80 and 0x8f
288 * for Four-Byte UTF: U+10000..U+10FFFF
289 */
290 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
291 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
292 st->_errno = errno = EILSEQ;
293 else
294 {
295 st->ustate = U6;
296 st->keepc[1] = **inbuf;
297 }
298 break;
299 case U6:
300 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
301 {
302 st->ustate = U7;
303 st->keepc[2] = **inbuf;
304 }
305 else
306 st->_errno = errno = EILSEQ;
307 break;
308 case U7:
309 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
310 { /* replace with double NON_ID_CHARs */
311
312 utf8_len = 4;
313 st->keepc[3] = **inbuf;
314
315 convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
316
317 st->ustate = U4;
318 continue;
319
320#if 0
321 if ( *outbytesleft < 2 )
322 st->_errno = errno = E2BIG;
323 else
324 {
325 **outbuf = NON_ID_CHAR;
326 *(*outbuf+1) = NON_ID_CHAR;
327 (*outbytesleft) -= 2;
328
329 uconv_num++;
330
331 st->ustate = U0;
332 }
333#endif
334 }
335 else
336 st->_errno = errno = EILSEQ;
337 break;
338 default: /* should never come here */
339 st->_errno = errno = EILSEQ;
340 st->ustate = U0; /* reset state */
341 break;
342 }
343
344 if (st->_errno) {
345#ifdef DEBUG
346 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
347 st->_errno, st->ustate);
348#endif
349 break;
350 }
351
352 (*inbuf)++;
353 (*inbytesleft)--;
354
355 }
356
357 if (*inbytesleft == 0 && st->ustate != U0)
358 errno = EINVAL;
359
360 if (*inbytesleft > 0 && *outbytesleft == 0)
361 errno = E2BIG;
362
363 if (errno) {
364 int num_reversed_bytes = 0;
365
366 switch (st->ustate)
367 {
368 case U1:
369 num_reversed_bytes = 1;
370 break;
371 case U2:
372 num_reversed_bytes = 1;
373 break;
374 case U3:
375 num_reversed_bytes = 2;
376 break;
377 case U4:
378 num_reversed_bytes = utf8_len - 1;
379 break;
380 case U5:
381 num_reversed_bytes = 1;
382 break;
383 case U6:
384 num_reversed_bytes = 2;
385 break;
386 case U7:
387 num_reversed_bytes = 3;
388 break;
389 }
390
391 /*
392 * if error, *inbuf points to the byte following the last byte
393 * successfully used in the conversion.
394 */
395 *inbuf -= num_reversed_bytes;
396 *inbytesleft += num_reversed_bytes;
397 st->ustate = U0;
398 return ((size_t) -1);
399 }
400
401 return uconv_num;
402}
403
404/*
405 * Match Big-5 code by UTF8 code;
406 * Return: = 0 - match from Unicode to Big-5 found
407 * = 1 - match from Unicode to Big-5 NOT found
408 * =-1 - illegal sequence
409 *
410 * Since binary search of the UTF8 to Big-5 table is necessary, might as well
411 * return index and Big-5 code matching to the unicode.
412 */
413static int get_big5_by_utf(uint_t ucs, int *unidx, unsigned long *big5code)
414{
415 /* 0xfffe and 0xffff should not be allowed */
416 if ( ucs == 0xFFFE || ucs == 0xFFFF ) return -1;
417
418 *unidx = binsearch(ucs, utf_big5_tab, MAX_BIG5_NUM);
419 if ((*unidx) >= 0)
420 *big5code = utf_big5_tab[*unidx].big5code;
421 else
422 return(1); /* match from UTF8 to Big-5 not found */
423#ifdef DEBUG
424 fprintf(stderr, "Unicode=%04x, idx=%5d, Big-5=%x ", ucs, *unidx, *big5code);
425#endif
426
427 return(0);
428}
429
430
431/*
432 * ISO/IEC 10646 (Unicode) --> Big-5
433 * Unicode --> UTF8 (FSS-UTF)
434 * (File System Safe Universal Character Set Transformation Format)
435 * Return: > 0 - converted with enough space in output buffer
436 * = 0 - no space in outbuf
437 */
438static int utf8_to_big5(int unidx, unsigned long big5code, char *buf, size_t buflen, int *uconv_num)
439{
440 unsigned long val; /* Big-5 value */
441 char c1, c2, big5_str[3];
442
443 if (buflen < 2) {
444 errno = E2BIG;
445 return(0);
446 }
447
448 if (unidx < 0) { /* no match from UTF8 to Big-5 */
449 *buf = *(buf+1) = NON_ID_CHAR;
450
451 /* non-identical conversion */
452 *uconv_num = 1;
453
454 } else {
455 val = big5code & 0xffff;
456 c1 = (char) ((val & 0xff00) >> 8);
457 c2 = (char) (val & 0xff);
458
459 *buf = big5_str[0] = c1;
460 *(buf+1) = big5_str[1] = c2;
Toomas Soomea7fb1da2019-01-28 09:59:47 +0200461 big5_str[2] = '\0';
Alexander Pyhalov16d86562018-11-21 12:34:20 +0300462 }
463
464#ifdef DEBUG
465 fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
466#endif
467
468 return(2);
469}
470
471
472/* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
473static int binsearch(unsigned long x, utf_big5 v[], int n)
474{
475 int low, high, mid;
476
477 low = 0;
478 high = n - 1;
479 while (low <= high) {
480 mid = (low + high) / 2;
481 if (x < v[mid].unicode)
482 high = mid - 1;
483 else if (x > v[mid].unicode)
484 low = mid + 1;
485 else /* found match */
486 return mid;
487 }
488 return (-1); /* no match */
489}