blob: 254616c294f28f4bb227c7094fc67cd38ce0131f [file] [log] [blame]
Alexander Pyhalov16d86562018-11-21 12:34:20 +03001/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1995, by Sun Microsystems, Inc.
24 * All rights reserved.
25 */
26
27#include <stdio.h>
28#include <stdlib.h>
29#include <sys/types.h>
30#include <errno.h>
31#include "unicode_big5p.h" /* UTF8 to Big-5 Plus mapping table */
32#include "common_defs.h"
33
#define MSB 0x80 /* most significant bit of a byte; clear for ASCII, also the 10xxxxxx continuation tag */
#define ONEBYTE 0xff /* mask selecting the low-order (right-most) byte */

#define NON_ID_CHAR '?' /* substitute written (twice) for a code point with no Big-5 Plus mapping */
38
/*
 * Per-descriptor conversion state.  Allocated by _icv_open(), released by
 * _icv_close(), and carried between _icv_iconv() calls so that a UTF-8
 * sequence split across two input buffers can be resumed.
 */
typedef struct _icv_state {
	char keepc[6]; /* bytes of the UTF-8 sequence being decoded (sized for the maximum # of bytes of a UTF8 code) */
	short ustate; /* current decoder state, one of enum _USTATE */
	int _errno; /* internal errno */
} _iconv_st;
44
/*
 * States of the UTF-8 decoder in _icv_iconv():
 *   U0 - between characters (ASCII is copied straight through)
 *   U1 - 2-byte sequence, waiting for its 2nd byte
 *   U2 - 3-byte sequence, waiting for its 2nd byte
 *   U3 - 3-byte sequence, waiting for its 3rd byte
 *   U4 - 2- or 3-byte sequence complete, Big-5 Plus output pending
 *   U5 - 4-byte sequence, waiting for its 2nd byte
 *   U6 - 4-byte sequence, waiting for its 3rd byte
 *   U7 - 4-byte sequence, waiting for its 4th byte
 */
enum _USTATE { U0, U1, U2, U3, U4, U5, U6, U7 };
46
/* File-local helpers; each is documented at its definition below. */
static int get_big5p_by_utf(char, char, int *, unsigned long *);
static int utf8_to_big5p(int, unsigned long, char *, size_t);
static int binsearch(unsigned long, utf_big5p[], int);
50
51
52/*
53 * Open; called from iconv_open()
54 */
55void *
56_icv_open()
57{
58 _iconv_st *st;
59
60 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
61 errno = ENOMEM;
62 return ((void *) -1);
63 }
64
65 st->ustate = U0;
66 st->_errno = 0;
67
68 return ((void *) st);
69}
70
71
72/*
73 * Close; called from iconv_close()
74 */
75void
76_icv_close(_iconv_st *st)
77{
78 if (!st)
79 errno = EBADF;
80 else
81 free(st);
82}
83
84
85/*
86 * Actual conversion; called from iconv()
87 */
88/*=========================================================
89 *
90 * State Machine for interpreting UTF8 code
91 *
92 *=========================================================
93 *
94 * 2nd byte 3rd byte 4th byte
95 * +----->------->------->U5------>U6--------->U7
96 * | |
97 * | 3 byte unicode |
98 * +----->------->-------+ |
99 * | | |
100 * ^ v |
101 * | 2 byte U2 ---> U3 |
102 * | unicode v |
103 * +------> U0 -------> U1 +-------->U4---+
104 * ^ ascii | | ^ |
105 * | | +-------->--------->--------+ |
106 * | v v
107 * +----<---+-----<------------<------------<------------+
108 *
109 *=========================================================*/
110size_t
111_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
112 char **outbuf, size_t *outbytesleft)
113{
114 char c1 = '\0', c2 = '\0';
115 int n, unidx;
116 unsigned long big5pcode;
117
118#ifdef DEBUG
119 fprintf(stderr, "========== iconv(): UTF2 --> Big-5 Plus ==========\n");
120#endif
121 if (st == NULL) {
122 errno = EBADF;
123 return ((size_t) -1);
124 }
125
126 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
127 st->ustate = U0;
128 st->_errno = 0;
129 return ((size_t) 0);
130 }
131
132 st->_errno = 0; /* reset internal errno */
133 errno = 0; /* reset external errno */
134
135 /* a state machine for interpreting UTF8 code */
136 while (*inbytesleft > 0 && *outbytesleft > 0) {
137
138 uchar_t first_byte;
139
140 switch (st->ustate) {
141 case U0: /* assuming ASCII in the beginning */
142 if ((**inbuf & MSB) == 0) { /* ASCII */
143 **outbuf = **inbuf;
144 (*outbuf)++;
145 (*outbytesleft)--;
146 } else { /* Chinese character */
147 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode 0xc2..0xdf */
148
149 /* invalid sequence if the first char is either 0xc0 or 0xc1 */
150 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
151 st->_errno = errno = EILSEQ;
152 else {
153 st->ustate = U1;
154 st->keepc[0] = **inbuf;
155 }
156 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte 0xe0..0xef */
157 st->ustate = U2;
158 st->keepc[0] = **inbuf;
159 } else {
160 /* currently the 16 planes are supported */
161 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
162 st->_errno = errno = EILSEQ;
163 else
164 {
165 st->ustate = U5;
166 st->keepc[0] = **inbuf;
167 }
168 }
169 }
170 break;
171 case U1: /* 2 byte unicode */
172 if ((**inbuf & 0xc0) == MSB) {
173 st->ustate = U4;
174 st->keepc[1] = **inbuf;
175 c1 = (st->keepc[0]&0x1c)>>2;
176 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
177#ifdef DEBUG
178 fprintf(stderr, "UTF8: %02x%02x --> ",
179 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
180#endif
181 continue; /* should not advance *inbuf */
182 } else {
183 st->_errno = errno = EILSEQ;
184 }
185 break;
186 case U2: /* 3 byte unicode - 2nd byte */
187
188 first_byte = st->keepc[0];
189
190 /* if the first byte is 0xed, it is illegal sequence if the second
191 * one is between 0xa0 and 0xbf because the surrogate section is ill-formed
192 */
193 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
194 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
195 st->_errno = errno = EILSEQ;
196 else {
197 st->ustate = U3;
198 st->keepc[1] = **inbuf;
199 }
200 break;
201 case U3: /* 3 byte unicode - 3rd byte */
202 if ((**inbuf & 0xc0) == MSB) {
203 st->ustate = U4;
204 st->keepc[2] = **inbuf;
205 c1 = ((st->keepc[0]&0x0f)<<4) |
206 ((st->keepc[1]&0x3c)>>2);
207 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
208#ifdef DEBUG
209 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
210 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
211#endif
212 continue; /* should not advance *inbuf */
213 } else {
214 st->_errno = errno = EILSEQ;
215 }
216 break;
217 case U4:
218 n = get_big5p_by_utf(c1, c2, &unidx, &big5pcode);
219 if ( n == -1 ) { /* unicode is either 0xfffe or 0xffff */
220 st->_errno = errno = EILSEQ;
221 break;
222 }
223
224/* comment the following lines to ignore no Big5 plus characters
225 if (n != 0) {
226 st->_errno = errno = EILSEQ;
227 break;
228 }
229*/
230
231 n = utf8_to_big5p(unidx, big5pcode,
232 *outbuf, *outbytesleft);
233 if (n > 0) {
234 (*outbuf) += n;
235 (*outbytesleft) -= n;
236
237 st->ustate = U0;
238 } else {
239 st->_errno = errno = E2BIG;
240 }
241 break;
242 case U5:
243 first_byte = st->keepc[0];
244
245 /* if the first byte is 0xf0, it is illegal sequence if
246 * the second one is between 0x80 and 0x8f
247 * for Four-Byte UTF: U+10000..U+10FFFF
248 */
249 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
250 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
251 st->_errno = errno = EILSEQ;
252 else
253 {
254 st->ustate = U6;
255 st->keepc[1] = **inbuf;
256 }
257 break;
258 case U6:
259 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
260 {
261 st->ustate = U7;
262 st->keepc[2] = **inbuf;
263 }
264 else
265 st->_errno = errno = EILSEQ;
266 break;
267 case U7:
268 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
269 { /* skip it */
270 st->ustate = U0;
271 }
272 else
273 st->_errno = errno = EILSEQ;
274 break;
275 default: /* should never come here */
276 st->_errno = errno = EILSEQ;
277 st->ustate = U0; /* reset state */
278 break;
279 }
280
281 if (st->_errno) {
282#ifdef DEBUG
283 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
284 st->_errno, st->ustate);
285#endif
286 break;
287 }
288
289 (*inbuf)++;
290 (*inbytesleft)--;
291 }
292
293 if (errno) return ((size_t) -1);
294
295 if (*inbytesleft == 0 && st->ustate != U0) {
296 errno = EINVAL;
297 return ((size_t) -1);
298 }
299
300 if (*inbytesleft > 0 && *outbytesleft == 0) {
301 errno = E2BIG;
302 return((size_t) -1);
303 }
304 return (*inbytesleft);
305}
306
307
308/*
309 * Match Big-5 Plus code by UTF8 code;
310 * Return: = 0 - match from Unicode to Big-5 Plus found
311 * = 1 - match from Unicode to Big-5 Plus NOT found
312 * =-1 - illegal sequence
313 *
314 * Since binary search of the UTF8 to Big-5 Plus table is necessary, might as well
315 * return index and Big-5 Plus code matching to the unicode.
316 */
317static int get_big5p_by_utf(char c1, char c2, int *unidx, unsigned long *big5pcode)
318{
319 unsigned long unicode;
320
321 unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
322 /* 0xfffe and 0xffff should not be allowed */
323 if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
324
325 *unidx = binsearch(unicode, utf_big5p_tab, MAX_BIG5P_NUM);
326 if ((*unidx) >= 0)
327 *big5pcode = utf_big5p_tab[*unidx].big5pcode;
328 else
329 return(1); /* match from UTF8 to Big-5 Plus not found */
330#ifdef DEBUG
331 fprintf(stderr, "Unicode=%04x, idx=%5d, Big-5 Plus=%x ", unicode, *unidx, *big5pcode);
332#endif
333
334 return(0);
335}
336
337
338/*
339 * ISO/IEC 10646 (Unicode) --> Big-5 Plus
340 * Unicode --> UTF8 (FSS-UTF)
341 * (File System Safe Universal Character Set Transformation Format)
342 * Return: > 0 - converted with enough space in output buffer
343 * = 0 - no space in outbuf
344 */
345static int utf8_to_big5p(int unidx, unsigned long big5pcode, char *buf, size_t buflen)
346{
347 unsigned long val; /* Big-5 Plus value */
348 char c1, c2, big5p_str[3];
349
350 if (buflen < 2) {
351 errno = E2BIG;
352 return(0);
353 }
354
355 if (unidx < 0) { /* no match from UTF8 to Big-5 Plus */
356 *buf = *(buf+1) = NON_ID_CHAR;
357 } else {
358 val = big5pcode & 0xffff;
359 c1 = (char) ((val & 0xff00) >> 8);
360 c2 = (char) (val & 0xff);
361
362 *buf = big5p_str[0] = c1;
363 *(buf+1) = big5p_str[1] = c2;
Toomas Soomea7fb1da2019-01-28 09:59:47 +0200364 big5p_str[2] = '\0';
Alexander Pyhalov16d86562018-11-21 12:34:20 +0300365 }
366
367#ifdef DEBUG
368 fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
369#endif
370
371 return(2);
372}
373
374
375/* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
376static int binsearch(unsigned long x, utf_big5p v[], int n)
377{
378 int low, high, mid;
379
380 low = 0;
381 high = n - 1;
382 while (low <= high) {
383 mid = (low + high) / 2;
384 if (x < v[mid].unicode)
385 high = mid - 1;
386 else if (x > v[mid].unicode)
387 low = mid + 1;
388 else /* found match */
389 return mid;
390 }
391 return (-1); /* no match */
392}