blob: 12b3713ea2af670a61108d1ca5f0f9cbd82c3857 [file] [log] [blame]
Alexander Pyhalov16d86562018-11-21 12:34:20 +03001/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1995, by Sun Microsystems, Inc.
24 * All rights reserved.
25 */
26
27#include <stdio.h>
28#include <stdlib.h>
29#include <errno.h>
30#include "cns11643_unicode_TW.h" /* CNS 11643 to UTF8 mapping table */
31
/* Bit masks and marker bytes used while assembling a CNS 11643 character. */
#define	MSB	0x80	/* high bit, OR-ed into every stored character byte */
#define	MBYTE	0x8e	/* lead byte stored for a 4-byte (plane 2-16) character */
#define	PMASK	0xa0	/* base added to the plane number for the plane byte */
#define	ONEBYTE	0xff	/* keeps only the low-order byte of a value */
#define	MSB_OFF	0x7f	/* clears the MSB, leaving the 7-bit value */

/* Control characters recognised by the ISO 2022-7 state machine. */
#define	SI	0x0f	/* shift in */
#define	SO	0x0e	/* shift out */
#define	ESC	0x1b	/* escape */
41
42/*
43 * static const char plane_char[] = "0GH23456789:;<=>?";
44 * static const char plane_char[] = "0GHIJKLMNOPQRSTUV";
45 * #define GET_PLANEC(i) (plane_char[i])
46 */
47
/*
 * Three-byte UTF-8 sequence written in place of a character that cannot be
 * identified (EF BF BD, i.e. U+FFFD).
 */
#define	UTF8_NON_ID_CHAR1	0xEF
#define	UTF8_NON_ID_CHAR2	0xBF
#define	UTF8_NON_ID_CHAR3	0xBD
52
/*
 * Conversion state carried between calls for one conversion descriptor.
 */
typedef struct _icv_state {
	/* bytes of the CNS 11643 character being assembled (at most 4) */
	char	keepc[4];
	/* current state of the ISO 2022-7 state machine (enum _CSTATE) */
	short	cstate;
	/* plane number of the Chinese character set currently designated */
	int	plane_no;
	/* internal copy of the last error recorded by the converter */
	int	_errno;
} _iconv_st;
59
/* States of the ISO 2022-7 parser in _icv_iconv(). */
enum _CSTATE {
	C0,	/* plain ASCII; ESC starts an escape sequence */
	C1,	/* got ESC, expecting '$' */
	C2,	/* got "ESC $", expecting ')' */
	C3,	/* got "ESC $ )", expecting the plane character */
	C4,	/* plane selected; SO starts Chinese text, ESC restarts */
	C5,	/* shifted out; expecting 1st byte of a character, or SI */
	C6,	/* plane 1: expecting the 2nd byte of the character */
	C7	/* planes 2-16: expecting the 2nd byte of the character */
};
61
62
63static int get_plane_no_by_iso(const char);
64static int iso_to_utf8(int, char[], char*, size_t);
65static int binsearch(unsigned long, cns_utf[], int);
66
67
68/*
69 * Open; called from iconv_open()
70 */
71void *
72_icv_open()
73{
74 _iconv_st *st;
75
76 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
77 errno = ENOMEM;
78 return ((void *) -1);
79 }
80
81 st->cstate = C0;
82 st->plane_no = 0;
83 st->_errno = 0;
84
85 return ((void *) st);
86}
87
88
89/*
90 * Close; called from iconv_close()
91 */
92void
93_icv_close(_iconv_st *st)
94{
95 if (!st)
96 errno = EBADF;
97 else
98 free(st);
99}
100
101
102/*
103 * Actual conversion; called from iconv()
104 */
105/*=========================================================================
106 *
107 * State Machine for interpreting ISO 2022-7 code
108 *
109 *=========================================================================
110 *
111 * plane 2 - 16
112 * +---------->-------+
113 * plane ^ |
114 * ESC $ ) number SO | plane 1 v
115 * +-> C0 ----> C1 ---> C2 ---> C3 ------> C4 --> C5 -------> C6 C7
116 * | | ascii | ascii | ascii | ascii | SI | | | |
117 * +----------------------------+ <-----+------+ +------<---+------+
118 * ^ |
119 * | ascii v
120 * +---------<-------------<---------+
121 *
122 *=========================================================================*/
123size_t
124_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
125 char **outbuf, size_t *outbytesleft)
126{
127 int n;
128
129#ifdef DEBUG
130 fprintf(stderr, "========== iconv(): ISO2022-7 --> UTF2 ==========\n");
131#endif
132 if (st == NULL) {
133 errno = EBADF;
134 return ((size_t) -1);
135 }
136
137 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
138 st->cstate = C0;
139 st->_errno = 0;
140 return ((size_t) 0);
141 }
142
143 st->_errno = 0; /* reset internal errno */
144 errno = 0; /* reset external errno */
145
146 /* a state machine for interpreting ISO 2022-7 code */
147 while (*inbytesleft > 0 && *outbytesleft > 0) {
148 switch (st->cstate) {
149 case C0: /* assuming ASCII in the beginning */
150 if (**inbuf == ESC) {
151 st->cstate = C1;
152 } else { /* real ASCII */
153 **outbuf = **inbuf;
154 (*outbuf)++;
155 (*outbytesleft)--;
156 }
157 break;
158 case C1: /* got ESC, expecting $ */
159 if (**inbuf == '$') {
160 st->cstate = C2;
161 } else {
162 **outbuf = ESC;
163 (*outbuf)++;
164 (*outbytesleft)--;
165 st->cstate = C0;
166 st->_errno = 0;
167 continue; /* don't advance inbuf */
168 }
169 break;
170 case C2: /* got $, expecting ) */
171 if (**inbuf == ')') {
172 st->cstate = C3;
173 } else {
174 if (*outbytesleft < 2) {
175 st->_errno = errno = E2BIG;
176 return((size_t)-1);
177 }
178 **outbuf = ESC;
179 *(*outbuf+1) = '$';
180 (*outbuf) += 2;
181 (*outbytesleft) -= 2;
182 st->cstate = C0;
183 st->_errno = 0;
184 continue; /* don't advance inbuf */
185 }
186 break;
187 case C3: /* got ) expecting G,H,I,...,V */
188 st->plane_no = get_plane_no_by_iso(**inbuf);
189 if (st->plane_no > 0 ) { /* plane #1 - #16 */
190 st->cstate = C4;
191 } else {
192 if (*outbytesleft < 3) {
193 st->_errno = errno = E2BIG;
194 return((size_t)-1);
195 }
196 **outbuf = ESC;
197 *(*outbuf+1) = '$';
198 *(*outbuf+2) = ')';
199 (*outbuf) += 3;
200 (*outbytesleft) -= 3;
201 st->cstate = C0;
202 st->_errno = 0;
203 continue; /* don't advance inbuf */
204 }
205 break;
206 case C4: /* SI (Shift In) */
207 if (**inbuf == ESC) {
208 st->cstate = C1;
209 break;
210 }
211 if (**inbuf == SO) {
212#ifdef DEBUG
213 fprintf(stderr, "<-------------- SO -------------->\n");
214#endif
215 st->cstate = C5;
216 } else { /* ASCII */
217 **outbuf = **inbuf;
218 (*outbuf)++;
219 (*outbytesleft)--;
220 st->cstate = C0;
221 st->_errno = 0;
222 }
223 break;
224 case C5: /* SO (Shift Out) */
225 if (**inbuf == SI) {
226#ifdef DEBUG
227 fprintf(stderr, ">-------------- SI --------------<\n");
228#endif
229 st->cstate = C4;
230 } else { /* 1st Chinese character */
231 if (st->plane_no == 1) {
232 st->keepc[0] = (char) (**inbuf | MSB);
233 st->cstate = C6;
234 } else { /* plane #1 - #16 */
235 st->keepc[0] = (char) MBYTE;
236 st->keepc[1] = (char) (PMASK +
237 st->plane_no);
238 st->keepc[2] = (char) (**inbuf | MSB);
239 st->cstate = C7;
240 }
241 }
242 break;
243 case C6: /* plane #1: 2nd Chinese character */
244 st->keepc[1] = (char) (**inbuf | MSB);
Toomas Soomea7fb1da2019-01-28 09:59:47 +0200245 st->keepc[2] = st->keepc[3] = '\0';
Alexander Pyhalov16d86562018-11-21 12:34:20 +0300246 n = iso_to_utf8(1, st->keepc, *outbuf,
247 *outbytesleft);
248 if (n > 0) {
249 (*outbuf) += n;
250 (*outbytesleft) -= n;
251 } else {
252 st->_errno = errno;
253 return((size_t)-1);
254 }
255 st->cstate = C5;
256 break;
257 case C7: /* 4th Chinese character */
258 st->keepc[3] = (char) (**inbuf | MSB);
259 n = iso_to_utf8(st->plane_no, st->keepc, *outbuf,
260 *outbytesleft);
261 if (n > 0) {
262 (*outbuf) += n;
263 (*outbytesleft) -= n;
264 } else {
265 st->_errno = errno;
266 return((size_t)-1);
267 }
268 st->cstate = C5;
269 break;
270 default: /* should never come here */
271 st->_errno = errno = EILSEQ;
272 st->cstate = C0; /* reset state */
273 break;
274 }
275
276 (*inbuf)++;
277 (*inbytesleft)--;
278
279 if (st->_errno) {
280#ifdef DEBUG
281 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\tinbuf=%x\n",
282 st->_errno, st->cstate, **inbuf);
283#endif
284 break;
285 }
286 if (errno)
287 return((size_t)-1);
288 }
289
290 if (*inbytesleft > 0 && *outbytesleft == 0) {
291 errno = E2BIG;
292 return((size_t)-1);
293 }
294 return (*inbytesleft);
295}
296
297
/*
 * Map the final character of an "ESC $ )" sequence to a plane number.
 *
 * '0' yields plane 0; 'G' through 'V' yield planes 1 through 16 ('G' is 1,
 * 'H' is 2, and so on).  Any other character yields -1.
 */
static int
get_plane_no_by_iso(const char inbuf)
{
	const unsigned char final = (unsigned char)inbuf;
	int plane;

	if (final == '0')		/* plane #0 */
		return (0);

	/* 'G' is one past 'F', so the distance from 'F' is the plane. */
	plane = final - 'F';
	if (plane >= 1 && plane <= 16)
		return (plane);

	return (-1);
}
333
334
335/*
336 * ISO 2022-7 code --> ISO/IEC 10646 (Unicode)
337 * Unicode --> UTF8 (FSS-UTF)
338 * (File System Safe Universal Character Set Transformation Format)
339 * Return: > 0 - converted with enough space in output buffer
340 * = 0 - no space in outbuf
341 */
342static int iso_to_utf8(int plane_no, char keepc[], char *buf, size_t buflen)
343{
344 char iso_str[3];
345 unsigned long iso_val; /* ISO 2022-7 value */
346 int unidx; /* Unicode index */
347 unsigned long uni_val; /* Unicode */
348
349#ifdef DEBUG
350 fprintf(stderr, "%s %d ", keepc, plane_no);
351#endif
352 if (plane_no == 1) {
353 iso_str[0] = keepc[0] & MSB_OFF;
354 iso_str[1] = keepc[1] & MSB_OFF;
355 } else {
356 iso_str[0] = keepc[2] & MSB_OFF;
357 iso_str[1] = keepc[3] & MSB_OFF;
358 }
359 iso_val = (iso_str[0] << 8) + iso_str[1];
360#ifdef DEBUG
361 fprintf(stderr, "%x\t", iso_val);
362#endif
363
364 switch (plane_no) {
365 case 1:
366 unidx = binsearch(iso_val, cns1_utf_tab, MAX_CNS1_NUM);
367 if (unidx >= 0)
368 uni_val = cns1_utf_tab[unidx].unicode;
369 break;
370 case 2:
371 unidx = binsearch(iso_val, cns2_utf_tab, MAX_CNS2_NUM);
372 if (unidx >= 0)
373 uni_val = cns2_utf_tab[unidx].unicode;
374 break;
375 case 3:
376 case 14:
377 unidx = binsearch(iso_val, cns3_utf_tab, MAX_CNS3_NUM);
378 if (unidx >= 0)
379 uni_val = cns3_utf_tab[unidx].unicode;
380 break;
381 default:
382 unidx = -1; /* no mapping from CNS to UTF8 */
383 break;
384 }
385
386#ifdef DEBUG
387 fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val);
388#endif
389
390 if (unidx >= 0) { /* do Unicode to UTF8 conversion */
391 if (uni_val > 0x0080 && uni_val <= 0x07ff) {
392 if (buflen < 2) {
393 errno = E2BIG;
394 return(0);
395 }
396 *buf = (char)((uni_val >> 6) & 0x1f) | 0xc0;
397 *(buf+1) = (char)(uni_val & 0x3f) | 0x80;
398#ifdef DEBUG
399 fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE);
400#endif
401 return(2);
402 }
403 if (uni_val > 0x0800 && uni_val <= 0xffff) {
404 if (buflen < 3) {
405 errno = E2BIG;
406 return(0);
407 }
408 *buf = (char)((uni_val >> 12) & 0xf) | 0xe0;
409 *(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80;
410 *(buf+2) = (char)(uni_val & 0x3f) | 0x80;
411#ifdef DEBUG
412 fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE);
413#endif
414 return(3);
415 }
416 }
417
418 /* can't find a match in CNS --> UTF8 table or illegal UTF8 code */
419 if (buflen < 3) {
420 errno = E2BIG;
421 return(0);
422 }
423
424 *(unsigned char*) buf = UTF8_NON_ID_CHAR1;
425 *(unsigned char*) (buf+1) = UTF8_NON_ID_CHAR2;
426 *(unsigned char*) (buf+2) = UTF8_NON_ID_CHAR3;
427
428#ifdef DEBUG
429 fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2));
430#endif
431 return(3);
432}
433
434
435/* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
436static int binsearch(unsigned long x, cns_utf v[], int n)
437{
438 int low, high, mid;
439
440 low = 0;
441 high = n - 1;
442 while (low <= high) {
443 mid = (low + high) / 2;
444 if (x < v[mid].cnscode)
445 high = mid - 1;
446 else if (x > v[mid].cnscode)
447 low = mid + 1;
448 else /* found match */
449 return mid;
450 }
451 return (-1); /* no match */
452}