/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2000, by Sun Microsystems, Inc.
 * All rights reserved.
 */
26
27#include <stdio.h>
28#include <stdlib.h>
29#include <errno.h>
30#include <sys/types.h>
31#include <sys/isa_defs.h>
32#include "unicode_big5hk.h" /* UTF8 to HKSCS mapping table */
33#include "common_defs.h"
34
35#define MSB 0x80 /* most significant bit */
36#define ONEBYTE 0xff /* right most byte */
37
38#define NON_ID_CHAR '?' /* non-identified character */
39
40typedef struct _icv_state {
41 char keepc[6]; /* maximum # byte of UTF8 code */
42 short ustate;
43 int _errno; /* internal errno */
44 boolean little_endian;
45 boolean bom_written;
46} _iconv_st;
47
48enum _USTATE { U0, U1, U2, U3, U4, U5, U6, U7 };
49
50static int get_hkscs_by_utf(uint_t, int *, unsigned long *);
51static int utf8_to_hkscs(int, unsigned long, char *, size_t, int *);
52static int binsearch(unsigned long, utf_hkscs[], int);
53
54/*
55 * Open; called from iconv_open()
56 */
57void *
58_icv_open()
59{
60 _iconv_st *st;
61
62 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
63 errno = ENOMEM;
64 return ((void *) -1);
65 }
66
67 st->ustate = U0;
68 st->_errno = 0;
69 st->little_endian = false;
70 st->bom_written = false;
71#if defined(UCS_2LE)
72 st->little_endian = true;
73 st->bom_written = true;
74#endif
75 return ((void *) st);
76}
77
78
79/*
80 * Close; called from iconv_close()
81 */
82void
83_icv_close(_iconv_st *st)
84{
85 if (!st)
86 errno = EBADF;
87 else
88 free(st);
89}
90
91
92/*
93 * Actual conversion; called from iconv()
94 */
95/*=========================================================
96 *
97 * State Machine for interpreting UTF8 code
98 *
99 *=========================================================
100 * 2nd byte 3rd byte 4th byte
101 * +----->------->------->U5---->U6------>U7
102 * | |
103 * | 3 byte unicode |
104 * +----->------->-------+ |
105 * | | |
106 * ^ v |
107 * | 2 byte U2 ---> U3 |
108 * | unicode v v
109 * +------> U0 -------> U1 +-------->U4---+
110 * ^ ascii | | ^ |
111 * | | +-------->--------->--------+ |
112 * | v v
113 * +----<---+-----<------------<------------<------------+
114 *
115 *=========================================================*/
116size_t
117_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
118 char **outbuf, size_t *outbytesleft)
119{
120 int utf8_len = 0;
121 int n, unidx;
122 unsigned long hkscscode;
123 int uconv_num = 0;
124 uint_t ucs;
125
126#ifdef DEBUG
127 fprintf(stderr, "========== iconv(): UTF2 --> HKSCS ==========\n");
128#endif
129 if (st == NULL) {
130 errno = EBADF;
131 return ((size_t) -1);
132 }
133
134 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
135 st->ustate = U0;
136 st->_errno = 0;
137 return ((size_t) 0);
138 }
139
140 st->_errno = 0; /* reset internal errno */
141 errno = 0; /* reset external errno */
142
143 /* a state machine for interpreting UTF8 code */
144 while (*inbytesleft > 0 && *outbytesleft > 0) {
145
146 uchar_t first_byte;
147 int uconv_num_internal = 0;
148
149 switch (st->ustate) {
150 case U0: /* assuming ASCII in the beginning */
151 /*
152 * Code converion for UCS-2LE to support Samba
153 */
154 if (st->little_endian) {
155 st->ustate = U1;
156 st->keepc[0] = **inbuf;
157 }
158 else if ((**inbuf & MSB) == 0) { /* ASCII */
159 **outbuf = **inbuf;
160 (*outbuf)++;
161 (*outbytesleft)--;
162 } else { /* Chinese character */
163 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode 0xc2..0xdf */
164
165 /* invalid sequence if the first byte is either 0xc0 or 0xc1 */
166 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
167 st->_errno = errno = EILSEQ;
168 else {
169 st->ustate = U1;
170 st->keepc[0] = **inbuf;
171 }
172 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte 0xe0..0xef */
173 st->ustate = U2;
174 st->keepc[0] = **inbuf;
175 } else {
176 /* four bytes of UTF-8 sequences */
177 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
178 st->_errno = errno = EILSEQ;
179 else
180 {
181 st->ustate = U5;
182 st->keepc[0] = **inbuf;
183 }
184 }
185 }
186 break;
187 case U1: /* 2 byte unicode */
188 if ((**inbuf & 0xc0) == MSB || st->little_endian) {
189 st->keepc[1] = **inbuf;
190 utf8_len = 2;
191
192 /*
193 * Code conversion for UCS-2LE to support Samba
194 */
195 if (st->little_endian) {
196 /*
197 * It's ASCII
198 */
199 if (st->keepc[1] == 0 && (st->keepc[0] & 0x80) == 0) {
200 *(*outbuf)++ = st->keepc[0];
201 (*outbytesleft)--;
202 st->ustate = U0;
203 break;
204 }
205
206 ucs = ((st->keepc[1] & 0xff) << 8) | ( st->keepc[0] & 0xff);
207
208 } else
209 convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
210
211 st->ustate = U4;
212#ifdef DEBUG
213 fprintf(stderr, "UTF8: %02x%02x --> ",
214 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
215#endif
216 continue; /* should not advance *inbuf */
217 } else {
218 st->_errno = errno = EILSEQ;
219 }
220 break;
221 case U2: /* 3 byte unicode - 2nd byte */
222
223 first_byte = st->keepc[0];
224
225 /* if the first byte is 0xed, it is illegal sequence if the second
226 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
227 */
228 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
229 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
230 st->_errno = errno = EILSEQ;
231 else {
232 st->ustate = U3;
233 st->keepc[1] = **inbuf;
234 }
235 break;
236 case U3: /* 3 byte unicode - 3rd byte */
237 if ((**inbuf & 0xc0) == MSB) {
238 st->ustate = U4;
239 st->keepc[2] = **inbuf;
240 utf8_len = 3;
241
242 convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
243#ifdef DEBUG
244 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
245 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
246#endif
247 continue; /* should not advance *inbuf */
248 } else {
249 st->_errno = errno = EILSEQ;
250 }
251 break;
252 case U4:
253 n = get_hkscs_by_utf(ucs, &unidx, &hkscscode);
254 if ( n == -1 ) { /* unicode is either 0xfffe or 0xffff */
255 st->_errno = errno = EILSEQ;
256 break;
257 }
258
259/* comment the following lines out to ignore the non-Big5 characters
260g if (n != 0) {
261 st->_errno = errno = EILSEQ;
262 break;
263 }
264*/
265
266 n = utf8_to_hkscs(unidx, hkscscode,
267 *outbuf, *outbytesleft, &uconv_num_internal);
268 if (n > 0) {
269 (*outbuf) += n;
270 (*outbytesleft) -= n;
271
272 uconv_num += uconv_num_internal;
273
274 st->ustate = U0;
275 } else {
276 st->_errno = errno;
277 }
278 break;
279 case U5:
280
281 first_byte = st->keepc[0];
282
283 /* if the first byte is 0xf0, it is illegal sequence if
284 * the second one is between 0x80 and 0x8f
285 * for Four-Byte UTF: U+10000..U+10FFFF
286 */
287 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
288 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
289 st->_errno = errno = EILSEQ;
290 else
291 {
292 st->ustate = U6;
293 st->keepc[1] = **inbuf;
294 }
295 break;
296 case U6:
297 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
298 {
299 st->ustate = U7;
300 st->keepc[2] = **inbuf;
301 }
302 else
303 st->_errno = errno = EILSEQ;
304 break;
305 case U7:
306 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
307 {
308 utf8_len = 4;
309 st->keepc[3] = **inbuf;
310
311 convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
312
313 st->ustate = U4;
314 continue; /* should not advance *inbuf */
315 }
316 else
317 st->_errno = errno = EILSEQ;
318 break;
319 default: /* should never come here */
320 st->_errno = errno = EILSEQ;
321 st->ustate = U0; /* reset state */
322 break;
323 }
324
325 if (st->_errno) {
326#ifdef DEBUG
327 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
328 st->_errno, st->ustate);
329#endif
330 break;
331 }
332
333 (*inbuf)++;
334 (*inbytesleft)--;
335 }
336
337 if (errno) return ((size_t) -1);
338
339 if (*inbytesleft == 0 && st->ustate != U0)
340 errno = EINVAL;
341
342 if (*inbytesleft > 0 && *outbytesleft == 0)
343 errno = E2BIG;
344
345 if (errno) {
346 int num_reversed_bytes = 0;
347
348 switch (st->ustate)
349 {
350 case U1:
351 num_reversed_bytes = 1;
352 break;
353 case U2:
354 num_reversed_bytes = 1;
355 break;
356 case U3:
357 num_reversed_bytes = 2;
358 break;
359 case U4:
360 num_reversed_bytes = utf8_len - 1;
361 break;
362 case U5:
363 num_reversed_bytes = 1;
364 break;
365 case U6:
366 num_reversed_bytes = 2;
367 break;
368 case U7:
369 num_reversed_bytes = 3;
370 break;
371 }
372
373 /*
374 * if error, *inbuf points to the byte following the last byte
375 * successfully used in the conversion.
376 */
377 *inbuf -= num_reversed_bytes;
378 *inbytesleft += num_reversed_bytes;
379 st->ustate = U0;
380 return ((size_t) -1);
381 }
382
383 return uconv_num;
384}
385
386/*
387 * Match HKSCS code by UTF8 code;
388 * Return: = 0 - match from Unicode to HKSCS found
389 * = 1 - match from Unicode to HKSCS NOT found
390 * =-1 - illegal sequence
391 *
392 * Since binary search of the UTF8 to HKSCS table is necessary, might as well
393 * return index and HKSCS code matching to the unicode.
394 */
395static int get_hkscs_by_utf(uint_t unicode, int *unidx, unsigned long *hkscscode)
396{
397 /* the 0xFFFE and 0xFFFF should not be allowed */
398 if (unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
399
400 *unidx = binsearch(unicode, utf_hkscs_tab, MAX_HKSCS_NUM);
401 if ((*unidx) >= 0)
402 *hkscscode = utf_hkscs_tab[*unidx].hkscscode;
403 else
404 return(1); /* match from UTF8 to HKSCS not found */
405#ifdef DEBUG
406 fprintf(stderr, "Unicode=%04x, idx=%5d, HKSCS=%x ", unicode, *unidx, *hkscscode);
407#endif
408
409 return(0);
410}
411
412
/*
 * Write the two-byte HKSCS form of one character into buf.
 *
 * unidx     - table index of the character; negative means the character
 *             has no HKSCS mapping
 * hkscscode - HKSCS code; only the low 16 bits are used, high byte first
 * buf       - output buffer
 * buflen    - bytes available in buf
 * uconv_num - out: set to 1 when the substitute sequence was written
 *             (a non-identical conversion); left untouched otherwise
 *
 * For an unmapped character two NON_ID_CHAR bytes are written instead.
 *
 * Return: > 0 - number of bytes written (always 2)
 *         = 0 - fewer than 2 bytes of space in buf; errno is set to E2BIG
 */
static int
utf8_to_hkscs(int unidx, unsigned long hkscscode, char *buf, size_t buflen,
    int *uconv_num)
{
	if (buflen < 2) {
		errno = E2BIG;
		return (0);
	}

	if (unidx < 0) {	/* no match from UTF8 to HKSCS */
		buf[0] = NON_ID_CHAR;
		buf[1] = NON_ID_CHAR;

		/* non-identical conversion */
		*uconv_num = 1;
	} else {
		buf[0] = (char)((hkscscode >> 8) & 0xff);
		buf[1] = (char)(hkscscode & 0xff);
	}

#ifdef DEBUG
	/* cast so bytes >= 0x80 are not sign-extended when printed */
	fprintf(stderr, "\t->%x %x<-\n", (unsigned char)buf[0],
	    (unsigned char)buf[1]);
#endif

	return (2);
}
451
452
453/* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
454static int binsearch(unsigned long x, utf_hkscs v[], int n)
455{
456 int low, high, mid;
457
458 low = 0;
459 high = n - 1;
460 while (low <= high) {
461 mid = (low + high) / 2;
462 if (x < v[mid].unicode)
463 high = mid - 1;
464 else if (x > v[mid].unicode)
465 low = mid + 1;
466 else /* found match */
467 return mid;
468 }
469 return (-1); /* no match */
470}