OpenWrt – Blame information for rev 1
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | #include <iconv.h> |
2 | #include <errno.h> |
||
3 | #include <wchar.h> |
||
4 | #include <string.h> |
||
5 | #include <strings.h> |
||
6 | #include <stdlib.h> |
||
7 | #include <limits.h> |
||
8 | |||
9 | #include <dirent.h> |
||
10 | #include <fcntl.h> |
||
11 | #include <sys/mman.h> |
||
12 | #include <sys/stat.h> |
||
13 | #include <unistd.h> |
||
14 | #include <stdint.h> |
||
15 | |||
16 | /* builtin charmaps */ |
||
17 | #include "charmaps.h" |
||
18 | |||
/*
 * Numeric ids of the charsets that are converted algorithmically.
 * The values are written in octal so they line up with the "\0NN" id
 * bytes used in the charsets[] name table.
 *
 * Ids 0-7 form the core set; iconv_open() rejects any destination id
 * above 8.
 */
#define UTF_16BE	000
#define UTF_16LE	001
#define UTF_32BE	002
#define UTF_32LE	003
#define WCHAR_T		004
#define UTF_8		005
#define US_ASCII	006
#define LATIN_1		007

/* additional charsets with algorithmic conversion */
#define LATIN_9		010
#define TIS_620		011
#define JIS_0201	012
||
33 | |||
34 | /* some programs like php need this */ |
||
35 | int _libiconv_version = _LIBICONV_VERSION; |
||
36 | |||
/*
 * Name table for the algorithmically converted charsets.
 *
 * Each entry is one id byte (it must match the charset id constants
 * defined above), followed by the NUL-terminated charset name.  Several
 * names may share one id (aliases).  A single 0xff byte ends the table.
 */
static const unsigned char charsets[] =
	/* Unicode encodings */
	"\005" "UTF-8"            "\0"
	"\004" "WCHAR_T"          "\0"
	"\000" "UTF-16BE"         "\0"
	"\001" "UTF-16LE"         "\0"
	"\002" "UTF-32BE"         "\0"
	"\003" "UTF-32LE"         "\0"
	/* US-ASCII and its aliases */
	"\006" "ASCII"            "\0"
	"\006" "US-ASCII"         "\0"
	"\006" "ISO646-US"        "\0"
	"\006" "ISO_646.IRV:1991" "\0"
	"\006" "ISO-IR-6"         "\0"
	"\006" "ANSI_X3.4-1968"   "\0"
	"\006" "ANSI_X3.4-1986"   "\0"
	"\006" "CP367"            "\0"
	"\006" "IBM367"           "\0"
	"\006" "US"               "\0"
	"\006" "CSASCII"          "\0"
	/* single-byte charsets */
	"\007" "ISO-8859-1"       "\0"
	"\007" "LATIN1"           "\0"
	"\010" "ISO-8859-15"      "\0"
	"\010" "LATIN9"           "\0"
	"\011" "ISO-8859-11"      "\0"
	"\011" "TIS-620"          "\0"
	"\012" "JIS-0201"         "\0"
	/* end-of-table marker */
	"\377";
||
64 | |||
/* Map type identifiers (single-byte, double-byte, ...) for charmaps. */
#define UCS2_8BIT	000
#define UCS3_8BIT	001
#define EUC		002
#define EUC_TW		003
#define SHIFT_JIS	004
#define BIG5		005
#define GBK		006

/*
 * FIXME: byte ranges of the multibyte encodings, noted in the original
 * source as not implemented yet:
 *   EUC:  A1-FE  A1-FE
 *   GBK:  81-FE  40-7E,80-FE
 *   Big5: A1-FE  40-7E,A1-FE
 */

/*
 * Map size per map type, indexed by the identifiers above.  Every size
 * is a constant 4 plus a per-entry width times the number of entries.
 */
static const unsigned short maplen[] = {
	[UCS2_8BIT] = 4 + 2 * 128,
	[UCS3_8BIT] = 4 + 3 * 128,
	[EUC]       = 4 + 2 * 94 * 94,
	[EUC_TW]    = 4 + 2 * 2 * 94 * 94,
	[SHIFT_JIS] = 4 + 2 * 94 * 94,
	[BIG5]      = 4 + 2 * 94 * 157,
	[GBK]       = 4 + 2 * 126 * 190,
};
||
89 | |||
90 | static int find_charmap(const char *name) |
||
91 | { |
||
92 | int i; |
||
93 | for (i = 0; i < (sizeof(charmaps) / sizeof(charmaps[0])); i++) |
||
94 | if (!strcasecmp(charmaps[i].name, name)) |
||
95 | return i; |
||
96 | return -1; |
||
97 | } |
||
98 | |||
99 | static int find_charset(const char *name) |
||
100 | { |
||
101 | const unsigned char *s; |
||
102 | for (s=charsets; *s<0xff && strcasecmp(s+1, name); s+=strlen(s)+1); |
||
103 | return *s; |
||
104 | } |
||
105 | |||
106 | iconv_t iconv_open(const char *to, const char *from) |
||
107 | { |
||
108 | unsigned f, t; |
||
109 | int m; |
||
110 | |||
111 | if ((t = find_charset(to)) > 8) |
||
112 | return -1; |
||
113 | |||
114 | if ((f = find_charset(from)) < 255) |
||
115 | return 0 | (t<<1) | (f<<8); |
||
116 | |||
117 | if ((m = find_charmap(from)) > -1) |
||
118 | return 1 | (t<<1) | (m<<8); |
||
119 | |||
120 | return -1; |
||
121 | } |
||
122 | |||
/*
 * Release a conversion descriptor.  Nothing is done with the
 * descriptor here; the function always returns 0.
 */
int iconv_close(iconv_t cd)
{
	(void)cd;

	return 0;
}
||
127 | |||
/*
 * Read a 16-bit value from two bytes at `s`.
 * Only the lowest bit of `endian` is used: 0 reads big-endian,
 * 1 reads little-endian.
 */
static inline wchar_t get_16(const unsigned char *s, int endian)
{
	const int hi = endian & 1;	/* index of the high-order byte */
	const int lo = hi ^ 1;		/* index of the low-order byte */

	return (s[hi] << 8) | s[lo];
}
||
133 | |||
/*
 * Store the low 16 bits of `c` into two bytes at `s`.
 * Only the lowest bit of `endian` is used: 0 writes big-endian,
 * 1 writes little-endian.
 */
static inline void put_16(unsigned char *s, wchar_t c, int endian)
{
	const int hi = endian & 1;	/* index of the high-order byte */

	s[hi]     = (unsigned char)(c >> 8);
	s[hi ^ 1] = (unsigned char)c;
}
||
140 | |||
/*
 * Encode code point `c` as UTF-8 into `outb`.
 *
 * Writes 1 to 4 bytes and returns how many were written.  Values above
 * 0x10FFFF are written as a single '?'.  The caller must provide room
 * for at least 4 bytes.
 */
static inline int utf8enc_wchar(char *outb, wchar_t c)
{
	if (c <= 0x7F) {
		/* single byte, stored as is */
		outb[0] = c;
		return 1;
	}

	if (c <= 0x7FF) {
		/* 110xxxxx 10xxxxxx */
		outb[0] = 0xC0 | ((c >> 6) & 0x1F);
		outb[1] = 0x80 | (c & 0x3F);
		return 2;
	}

	if (c <= 0xFFFF) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		outb[0] = 0xE0 | ((c >> 12) & 0x0F);
		outb[1] = 0x80 | ((c >> 6) & 0x3F);
		outb[2] = 0x80 | (c & 0x3F);
		return 3;
	}

	if (c <= 0x10FFFF) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		outb[0] = 0xF0 | ((c >> 18) & 0x07);
		outb[1] = 0x80 | ((c >> 12) & 0x3F);
		outb[2] = 0x80 | ((c >> 6) & 0x3F);
		outb[3] = 0x80 | (c & 0x3F);
		return 4;
	}

	/* out of range: substitute */
	outb[0] = '?';
	return 1;
}
||
170 | |||
/*
 * Test whether the `n`-byte UTF-8 sequence at `s` is an overlong
 * encoding, i.e. uses more bytes than its value requires.
 *
 * Only n = 2, 3 and 4 are examined; any other length yields 0.
 * Returns non-zero for an overlong sequence, 0 otherwise.
 */
static inline int utf8seq_is_overlong(char *s, int n)
{
	/*
	 * Inspect the bytes as unsigned: where plain char is signed, lead
	 * bytes such as 0xC0 or 0xE0 are negative and would never compare
	 * equal to the patterns below.
	 */
	const unsigned char *b = (const unsigned char *)s;

	switch (n) {
	case 2:
		/* 1100000x (10xxxxxx) */
		return (b[0] >> 1) == 0x60 &&
		       (b[1] >> 6) == 0x02;

	case 3:
		/* 11100000 100xxxxx (10xxxxxx) */
		return b[0] == 0xE0 &&
		       (b[1] >> 5) == 0x04 &&
		       (b[2] >> 6) == 0x02;

	case 4:
		/* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
		return b[0] == 0xF0 &&
		       (b[1] >> 4) == 0x08 &&
		       (b[2] >> 6) == 0x02 &&
		       (b[3] >> 6) == 0x02;

	default:
		return 0;
	}
}
||
196 | |||
/*
 * Test whether the `n`-byte UTF-8 sequence at `s` encodes a UTF-16
 * surrogate (U+D800..U+DFFF): a 3-byte sequence starting with 0xED
 * whose second byte lies in 0xA0..0xBF.
 *
 * Returns non-zero for a surrogate, 0 otherwise.
 */
static inline int utf8seq_is_surrogate(char *s, int n)
{
	/* compare as unsigned so 0xED etc. match where plain char is signed */
	const unsigned char *b = (const unsigned char *)s;

	return n == 3 && b[0] == 0xED && b[1] >= 0xA0 && b[1] <= 0xBF;
}
||
201 | |||
/*
 * Test whether the `n`-byte UTF-8 sequence at `s` is EF BF BE or
 * EF BF BF, the encodings of U+FFFE and U+FFFF.
 *
 * Returns non-zero for one of these sequences, 0 otherwise.
 */
static inline int utf8seq_is_illegal(char *s, int n)
{
	/* compare as unsigned so 0xEF etc. match where plain char is signed */
	const unsigned char *b = (const unsigned char *)s;

	return n == 3 && b[0] == 0xEF && b[1] == 0xBF &&
	       b[2] >= 0xBE && b[2] <= 0xBF;
}
||
207 | |||
/*
 * Decode one UTF-8 sequence starting at `in` into `*c`.
 *
 * `inb` is the number of input bytes available; the caller must ensure
 * at least one byte is readable.
 *
 * Returns:
 *   1..4  number of bytes consumed; *c holds the decoded value
 *   -1    invalid input: bad lead byte, bad continuation byte, overlong
 *         form, surrogate, U+FFFE/U+FFFF, or a 5/6-byte sequence
 *   -2    the sequence needs more bytes than `inb` provides
 *
 * *c may have been partially written when -1 is returned.
 */
static inline int utf8dec_wchar(wchar_t *c, unsigned char *in, size_t inb)
{
	int i;
	int n;

	/* trivial char */
	if (*in <= 0x7F) {
		*c = *in;
		return 1;
	}

	/* find utf8 sequence length from the lead byte */
	if ((*in & 0xE0) == 0xC0)
		n = 2;
	else if ((*in & 0xF0) == 0xE0)
		n = 3;
	else if ((*in & 0xF8) == 0xF0)
		n = 4;
	else if ((*in & 0xFC) == 0xF8)
		n = 5;
	else if ((*in & 0xFE) == 0xFC)
		n = 6;
	else
		/*
		 * Stray continuation byte, 0xFE or 0xFF.  This must be
		 * rejected before the length comparison below: a negative
		 * length compared against a size_t would be reported as
		 * "starved" rather than as an illegal sequence.
		 */
		return -1;

	/* starved? */
	if ((size_t)n > inb)
		return -2;

	/* unmapped sequence (> 4 bytes) */
	if (n > 4)
		return -1;

	/* reject invalid sequences */
	if (utf8seq_is_overlong((char *)in, n) ||
	    utf8seq_is_surrogate((char *)in, n) ||
	    utf8seq_is_illegal((char *)in, n))
		return -1;

	/* payload bits of the lead byte */
	*c = *in++ & (0x7F >> n);

	for (i = 1; i < n; i++) {
		/* illegal continuation byte */
		if (*in < 0x80 || *in > 0xBF)
			return -1;

		*c = (*c << 6) | (*in++ & 0x3F);
	}

	return n;
}
||
255 | |||
/*
 * Map the code points that ISO-8859-15 places differently from
 * ISO-8859-1 to their ISO-8859-15 byte value.
 *
 * Returns the byte value, or 0xFFFD when `c` is not one of them.
 */
static inline wchar_t latin9_translit(wchar_t c)
{
	static const struct {
		wchar_t ucs;
		unsigned char byte;
	} table[] = {
		{ 0x20AC, 0xA4 },	/* Euro */
		{ 0x0160, 0xA6 },	/* S caron */
		{ 0x0161, 0xA8 },	/* s caron */
		{ 0x017D, 0xB4 },	/* Z caron */
		{ 0x017E, 0xB8 },	/* z caron */
		{ 0x0152, 0xBC },	/* OE */
		{ 0x0153, 0xBD },	/* oe */
		{ 0x0178, 0xBE },	/* Y diaeresis */
	};
	size_t i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		if (table[i].ucs == c)
			return table[i].byte;
	}

	/* cannot translate */
	return 0xFFFD;
}
||
271 | |||
272 | size_t iconv(iconv_t cd, char **in, size_t *inb, char **out, size_t *outb) |
||
273 | { |
||
274 | size_t x=0; |
||
275 | unsigned char to = (cd>>1)&127; |
||
276 | unsigned char from = 255; |
||
277 | const unsigned char *map = 0; |
||
278 | char tmp[MB_LEN_MAX]; |
||
279 | wchar_t c, d; |
||
280 | size_t k, l; |
||
281 | int err; |
||
282 | |||
283 | if (!in || !*in || !*inb) return 0; |
||
284 | |||
285 | if (cd & 1) |
||
286 | map = charmaps[cd>>8].map; |
||
287 | else |
||
288 | from = cd>>8; |
||
289 | |||
290 | for (; *inb; *in+=l, *inb-=l) { |
||
291 | c = *(unsigned char *)*in; |
||
292 | l = 1; |
||
293 | if (from >= UTF_8 && c < 0x80) goto charok; |
||
294 | switch (from) { |
||
295 | case WCHAR_T: |
||
296 | l = sizeof(wchar_t); |
||
297 | if (*inb < l) goto starved; |
||
298 | c = *(wchar_t *)*in; |
||
299 | break; |
||
300 | case UTF_8: |
||
301 | l = utf8dec_wchar(&c, *in, *inb); |
||
302 | if (!l) l++; |
||
303 | else if (l == (size_t)-1) goto ilseq; |
||
304 | else if (l == (size_t)-2) goto starved; |
||
305 | break; |
||
306 | case US_ASCII: |
||
307 | goto ilseq; |
||
308 | case LATIN_9: |
||
309 | if ((unsigned)c - 0xa4 <= 0xbe - 0xa4) { |
||
310 | static const unsigned char map[] = { |
||
311 | 0, 0x60, 0, 0x61, 0, 0, 0, 0, 0, 0, 0, |
||
312 | 0, 0, 0, 0, 0x7d, 0, 0, 0, 0x7e, 0, 0, 0, |
||
313 | 0x52, 0x53, 0x78 |
||
314 | }; |
||
315 | if (c == 0xa4) c = 0x20ac; |
||
316 | else if (map[c-0xa5]) c = 0x100 | map[c-0xa5]; |
||
317 | } |
||
318 | case LATIN_1: |
||
319 | goto charok; |
||
320 | case TIS_620: |
||
321 | if (c >= 0xa1) c += 0x0e01-0xa1; |
||
322 | goto charok; |
||
323 | case JIS_0201: |
||
324 | if (c >= 0xa1) { |
||
325 | if (c <= 0xdf) c += 0xff61-0xa1; |
||
326 | else goto ilseq; |
||
327 | } |
||
328 | goto charok; |
||
329 | case UTF_16BE: |
||
330 | case UTF_16LE: |
||
331 | l = 2; |
||
332 | if (*inb < 2) goto starved; |
||
333 | c = get_16(*in, from); |
||
334 | if ((unsigned)(c-0xdc00) < 0x400) goto ilseq; |
||
335 | if ((unsigned)(c-0xd800) < 0x400) { |
||
336 | l = 4; |
||
337 | if (*inb < 4) goto starved; |
||
338 | d = get_16(*in + 2, from); |
||
339 | if ((unsigned)(c-0xdc00) >= 0x400) goto ilseq; |
||
340 | c = ((c-0xd800)<<10) | (d-0xdc00); |
||
341 | } |
||
342 | break; |
||
343 | case UTF_32BE: |
||
344 | case UTF_32LE: |
||
345 | l = 4; |
||
346 | if (*inb < 4) goto starved; |
||
347 | // FIXME |
||
348 | // c = get_32(*in, from); |
||
349 | break; |
||
350 | default: |
||
351 | /* only support ascii supersets */ |
||
352 | if (c < 0x80) break; |
||
353 | switch (map[0]) { |
||
354 | case UCS2_8BIT: |
||
355 | c -= 0x80; |
||
356 | break; |
||
357 | case EUC: |
||
358 | if ((unsigned)c - 0xa1 >= 94) goto ilseq; |
||
359 | if ((unsigned)in[0][1] - 0xa1 >= 94) goto ilseq; |
||
360 | c = (c-0xa1)*94 + (in[0][1]-0xa1); |
||
361 | l = 2; |
||
362 | break; |
||
363 | case SHIFT_JIS: |
||
364 | if ((unsigned)c - 0xa1 <= 0xdf-0xa1) { |
||
365 | c += 0xff61-0xa1; |
||
366 | goto charok; |
||
367 | } |
||
368 | // FIXME... |
||
369 | l = 2; |
||
370 | break; |
||
371 | default: |
||
372 | goto badf; |
||
373 | } |
||
374 | c = get_16(map + 4 + 2*c, 0); |
||
375 | if (c == 0xffff) goto ilseq; |
||
376 | goto charok; |
||
377 | } |
||
378 | |||
379 | if ((unsigned)c - 0xd800 < 0x800 || (unsigned)c >= 0x110000) |
||
380 | goto ilseq; |
||
381 | charok: |
||
382 | switch (to) { |
||
383 | case WCHAR_T: |
||
384 | if (*outb < sizeof(wchar_t)) goto toobig; |
||
385 | *(wchar_t *)*out = c; |
||
386 | *out += sizeof(wchar_t); |
||
387 | *outb -= sizeof(wchar_t); |
||
388 | break; |
||
389 | case UTF_8: |
||
390 | if (*outb < 4) { |
||
391 | k = utf8enc_wchar(tmp, c); |
||
392 | if (*outb < k) goto toobig; |
||
393 | memcpy(*out, tmp, k); |
||
394 | } else k = utf8enc_wchar(*out, c); |
||
395 | *out += k; |
||
396 | *outb -= k; |
||
397 | break; |
||
398 | case US_ASCII: |
||
399 | if (c > 0x7f) c = 0xfffd; |
||
400 | /* fall thru and count replacement in latin1 case */ |
||
401 | case LATIN_9: |
||
402 | if (c >= 0x100 && c != 0xfffd) |
||
403 | c = latin9_translit(c); |
||
404 | /* fall through */ |
||
405 | case LATIN_1: |
||
406 | if (c > 0xff) goto ilseq; |
||
407 | if (!*outb) goto toobig; |
||
408 | **out = c; |
||
409 | ++*out; |
||
410 | --*outb; |
||
411 | break; |
||
412 | case UTF_16BE: |
||
413 | case UTF_16LE: |
||
414 | if (c < 0x10000) { |
||
415 | if (*outb < 2) goto toobig; |
||
416 | put_16(*out, c, to); |
||
417 | *out += 2; |
||
418 | *outb -= 2; |
||
419 | break; |
||
420 | } |
||
421 | if (*outb < 4) goto toobig; |
||
422 | put_16(*out, (c>>10)|0xd800, to); |
||
423 | put_16(*out + 2, (c&0x3ff)|0xdc00, to); |
||
424 | *out += 4; |
||
425 | *outb -= 4; |
||
426 | break; |
||
427 | default: |
||
428 | goto badf; |
||
429 | } |
||
430 | } |
||
431 | return x; |
||
432 | ilseq: |
||
433 | err = EILSEQ; |
||
434 | x = -1; |
||
435 | goto end; |
||
436 | badf: |
||
437 | err = EBADF; |
||
438 | x = -1; |
||
439 | goto end; |
||
440 | toobig: |
||
441 | err = E2BIG; |
||
442 | x = -1; |
||
443 | goto end; |
||
444 | starved: |
||
445 | err = EINVAL; |
||
446 | end: |
||
447 | errno = err; |
||
448 | return x; |
||
449 | } |