nexmon – Blame information for rev 1
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | /* gutf8.c - Operations on UTF-8 strings. |
2 | * |
||
3 | * Copyright (C) 1999 Tom Tromey |
||
4 | * Copyright (C) 2000 Red Hat, Inc. |
||
5 | * |
||
6 | * This library is free software; you can redistribute it and/or |
||
7 | * modify it under the terms of the GNU Lesser General Public |
||
8 | * License as published by the Free Software Foundation; either |
||
9 | * version 2 of the License, or (at your option) any later version. |
||
10 | * |
||
11 | * This library is distributed in the hope that it will be useful, |
||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
14 | * Lesser General Public License for more details. |
||
15 | * |
||
16 | * You should have received a copy of the GNU Lesser General Public |
||
17 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
||
18 | */ |
||
19 | |||
20 | #include "config.h" |
||
21 | |||
22 | #include <stdlib.h> |
||
23 | #ifdef HAVE_CODESET |
||
24 | #include <langinfo.h> |
||
25 | #endif |
||
26 | #include <string.h> |
||
27 | |||
28 | #ifdef G_PLATFORM_WIN32 |
||
29 | #include <stdio.h> |
||
30 | #define STRICT |
||
31 | #include <windows.h> |
||
32 | #undef STRICT |
||
33 | #endif |
||
34 | |||
35 | #include "gconvert.h" |
||
36 | #include "ghash.h" |
||
37 | #include "gstrfuncs.h" |
||
38 | #include "gtestutils.h" |
||
39 | #include "gtypes.h" |
||
40 | #include "gthread.h" |
||
41 | #include "glibintl.h" |
||
42 | |||
/*
 * UTF8_COMPUTE:
 * @Char: first byte of a UTF-8 sequence (as an unsigned value)
 * @Mask: lvalue receiving the bit mask that extracts the payload bits
 *        of the first byte
 * @Len:  lvalue receiving the sequence length in bytes (1..6), or -1
 *        when @Char cannot start a sequence
 *
 * Classifies a lead byte.  @Mask is left untouched when @Len is set
 * to -1.  All parameters are parenthesized so that expression
 * arguments (e.g. `a | b`) keep their intended precedence.
 *
 * The expansion is a bare if/else chain that ends with a semicolon;
 * use it as a stand-alone statement only.
 */
#define UTF8_COMPUTE(Char, Mask, Len)					      \
  if ((Char) < 128)							      \
    {									      \
      (Len) = 1;							      \
      (Mask) = 0x7f;							      \
    }									      \
  else if (((Char) & 0xe0) == 0xc0)					      \
    {									      \
      (Len) = 2;							      \
      (Mask) = 0x1f;							      \
    }									      \
  else if (((Char) & 0xf0) == 0xe0)					      \
    {									      \
      (Len) = 3;							      \
      (Mask) = 0x0f;							      \
    }									      \
  else if (((Char) & 0xf8) == 0xf0)					      \
    {									      \
      (Len) = 4;							      \
      (Mask) = 0x07;							      \
    }									      \
  else if (((Char) & 0xfc) == 0xf8)					      \
    {									      \
      (Len) = 5;							      \
      (Mask) = 0x03;							      \
    }									      \
  else if (((Char) & 0xfe) == 0xfc)					      \
    {									      \
      (Len) = 6;							      \
      (Mask) = 0x01;							      \
    }									      \
  else									      \
    (Len) = -1;
||
76 | |||
/* Number of bytes (1..6) needed to encode code point Char in UTF-8. */
#define UTF8_LENGTH(Char)		\
  ((Char) < 0x80      ? 1 :		\
   (Char) < 0x800     ? 2 :		\
   (Char) < 0x10000   ? 3 :		\
   (Char) < 0x200000  ? 4 :		\
   (Char) < 0x4000000 ? 5 : 6)
||
83 | |||
84 | |||
/*
 * UTF8_GET:
 * Decodes the Len-byte sequence at Chars into Result.  Mask selects the
 * payload bits of the first byte; Count is a scratch index variable.
 * If any trailing byte is not of the form 10xxxxxx, Result is set to -1
 * and decoding stops at that byte.
 *
 * Expands to two statements (an assignment and a loop), so it must not
 * be used as the unbraced body of an if/else/loop.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
  (Result) = (Chars)[0] & (Mask);					      \
  (Count) = 1;								      \
  while ((Count) < (Len))						      \
    {									      \
      if (((Chars)[(Count)] & 0xc0) != 0x80)				      \
	{								      \
	  (Result) = -1;						      \
	  break;							      \
	}								      \
      (Result) = ((Result) << 6) | ((Chars)[(Count)] & 0x3f);		      \
      ++(Count);							      \
    }
||
97 | |||
/*
 * UNICODE_VALID:
 * @Char: the character to test
 *
 * Non-zero when @Char is an acceptable Unicode (5.2) code point:
 *
 *  - it is below 0x110000, the first value UTF-16 cannot represent and
 *    which Unicode therefore guarantees never to assign;
 *  - it is not in the surrogate block 0xD800..0xDFFF (category Cs).
 *
 * @Char is evaluated more than once; pass an expression without side
 * effects.
 */
#define UNICODE_VALID(Char)                        \
    ((Char) < 0x110000 &&                          \
     ((Char) < 0xD800 || (Char) > 0xDFFF))
||
111 | |||
112 | |||
113 | static const gchar utf8_skip_data[256] = { |
||
114 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
||
115 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
||
116 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
||
117 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
||
118 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
||
119 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
||
120 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
||
121 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 |
||
122 | }; |
||
123 | |||
124 | const gchar * const g_utf8_skip = utf8_skip_data; |
||
125 | |||
126 | /** |
||
127 | * g_utf8_find_prev_char: |
||
128 | * @str: pointer to the beginning of a UTF-8 encoded string |
||
129 | * @p: pointer to some position within @str |
||
130 | * |
||
131 | * Given a position @p with a UTF-8 encoded string @str, find the start |
||
132 | * of the previous UTF-8 character starting before @p. Returns %NULL if no |
||
133 | * UTF-8 characters are present in @str before @p. |
||
134 | * |
||
135 | * @p does not have to be at the beginning of a UTF-8 character. No check |
||
136 | * is made to see if the character found is actually valid other than |
||
137 | * it starts with an appropriate byte. |
||
138 | * |
||
139 | * Returns: a pointer to the found character or %NULL. |
||
140 | */ |
||
141 | gchar * |
||
142 | g_utf8_find_prev_char (const char *str, |
||
143 | const char *p) |
||
144 | { |
||
145 | for (--p; p >= str; --p) |
||
146 | { |
||
147 | if ((*p & 0xc0) != 0x80) |
||
148 | return (gchar *)p; |
||
149 | } |
||
150 | return NULL; |
||
151 | } |
||
152 | |||
153 | /** |
||
154 | * g_utf8_find_next_char: |
||
155 | * @p: a pointer to a position within a UTF-8 encoded string |
||
156 | * @end: (nullable): a pointer to the byte following the end of the string, |
||
157 | * or %NULL to indicate that the string is nul-terminated |
||
158 | * |
||
159 | * Finds the start of the next UTF-8 character in the string after @p. |
||
160 | * |
||
161 | * @p does not have to be at the beginning of a UTF-8 character. No check |
||
162 | * is made to see if the character found is actually valid other than |
||
163 | * it starts with an appropriate byte. |
||
164 | * |
||
165 | * Returns: a pointer to the found character or %NULL |
||
166 | */ |
||
167 | gchar * |
||
168 | g_utf8_find_next_char (const gchar *p, |
||
169 | const gchar *end) |
||
170 | { |
||
171 | if (*p) |
||
172 | { |
||
173 | if (end) |
||
174 | for (++p; p < end && (*p & 0xc0) == 0x80; ++p) |
||
175 | ; |
||
176 | else |
||
177 | for (++p; (*p & 0xc0) == 0x80; ++p) |
||
178 | ; |
||
179 | } |
||
180 | return (p == end) ? NULL : (gchar *)p; |
||
181 | } |
||
182 | |||
183 | /** |
||
184 | * g_utf8_prev_char: |
||
185 | * @p: a pointer to a position within a UTF-8 encoded string |
||
186 | * |
||
187 | * Finds the previous UTF-8 character in the string before @p. |
||
188 | * |
||
189 | * @p does not have to be at the beginning of a UTF-8 character. No check |
||
190 | * is made to see if the character found is actually valid other than |
||
191 | * it starts with an appropriate byte. If @p might be the first |
||
192 | * character of the string, you must use g_utf8_find_prev_char() instead. |
||
193 | * |
||
194 | * Returns: a pointer to the found character |
||
195 | */ |
||
196 | gchar * |
||
197 | g_utf8_prev_char (const gchar *p) |
||
198 | { |
||
199 | while (TRUE) |
||
200 | { |
||
201 | p--; |
||
202 | if ((*p & 0xc0) != 0x80) |
||
203 | return (gchar *)p; |
||
204 | } |
||
205 | } |
||
206 | |||
207 | /** |
||
208 | * g_utf8_strlen: |
||
209 | * @p: pointer to the start of a UTF-8 encoded string |
||
210 | * @max: the maximum number of bytes to examine. If @max |
||
211 | * is less than 0, then the string is assumed to be |
||
212 | * nul-terminated. If @max is 0, @p will not be examined and |
||
213 | * may be %NULL. If @max is greater than 0, up to @max |
||
214 | * bytes are examined |
||
215 | * |
||
216 | * Computes the length of the string in characters, not including |
||
217 | * the terminating nul character. If the @max'th byte falls in the |
||
218 | * middle of a character, the last (partial) character is not counted. |
||
219 | * |
||
220 | * Returns: the length of the string in characters |
||
221 | */ |
||
222 | glong |
||
223 | g_utf8_strlen (const gchar *p, |
||
224 | gssize max) |
||
225 | { |
||
226 | glong len = 0; |
||
227 | const gchar *start = p; |
||
228 | g_return_val_if_fail (p != NULL || max == 0, 0); |
||
229 | |||
230 | if (max < 0) |
||
231 | { |
||
232 | while (*p) |
||
233 | { |
||
234 | p = g_utf8_next_char (p); |
||
235 | ++len; |
||
236 | } |
||
237 | } |
||
238 | else |
||
239 | { |
||
240 | if (max == 0 || !*p) |
||
241 | return 0; |
||
242 | |||
243 | p = g_utf8_next_char (p); |
||
244 | |||
245 | while (p - start < max && *p) |
||
246 | { |
||
247 | ++len; |
||
248 | p = g_utf8_next_char (p); |
||
249 | } |
||
250 | |||
251 | /* only do the last len increment if we got a complete |
||
252 | * char (don't count partial chars) |
||
253 | */ |
||
254 | if (p - start <= max) |
||
255 | ++len; |
||
256 | } |
||
257 | |||
258 | return len; |
||
259 | } |
||
260 | |||
261 | /** |
||
262 | * g_utf8_substring: |
||
263 | * @str: a UTF-8 encoded string |
||
264 | * @start_pos: a character offset within @str |
||
265 | * @end_pos: another character offset within @str |
||
266 | * |
||
267 | * Copies a substring out of a UTF-8 encoded string. |
||
268 | * The substring will contain @end_pos - @start_pos characters. |
||
269 | * |
||
270 | * Returns: a newly allocated copy of the requested |
||
271 | * substring. Free with g_free() when no longer needed. |
||
272 | * |
||
273 | * Since: 2.30 |
||
274 | */ |
||
275 | gchar * |
||
276 | g_utf8_substring (const gchar *str, |
||
277 | glong start_pos, |
||
278 | glong end_pos) |
||
279 | { |
||
280 | gchar *start, *end, *out; |
||
281 | |||
282 | start = g_utf8_offset_to_pointer (str, start_pos); |
||
283 | end = g_utf8_offset_to_pointer (start, end_pos - start_pos); |
||
284 | |||
285 | out = g_malloc (end - start + 1); |
||
286 | memcpy (out, start, end - start); |
||
287 | out[end - start] = 0; |
||
288 | |||
289 | return out; |
||
290 | } |
||
291 | |||
292 | /** |
||
293 | * g_utf8_get_char: |
||
294 | * @p: a pointer to Unicode character encoded as UTF-8 |
||
295 | * |
||
296 | * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. |
||
297 | * |
||
298 | * If @p does not point to a valid UTF-8 encoded character, results |
||
299 | * are undefined. If you are not sure that the bytes are complete |
||
300 | * valid Unicode characters, you should use g_utf8_get_char_validated() |
||
301 | * instead. |
||
302 | * |
||
303 | * Returns: the resulting character |
||
304 | */ |
||
305 | gunichar |
||
306 | g_utf8_get_char (const gchar *p) |
||
307 | { |
||
308 | int i, mask = 0, len; |
||
309 | gunichar result; |
||
310 | unsigned char c = (unsigned char) *p; |
||
311 | |||
312 | UTF8_COMPUTE (c, mask, len); |
||
313 | if (len == -1) |
||
314 | return (gunichar)-1; |
||
315 | UTF8_GET (result, p, i, mask, len); |
||
316 | |||
317 | return result; |
||
318 | } |
||
319 | |||
320 | /** |
||
321 | * g_utf8_offset_to_pointer: |
||
322 | * @str: a UTF-8 encoded string |
||
323 | * @offset: a character offset within @str |
||
324 | * |
||
325 | * Converts from an integer character offset to a pointer to a position |
||
326 | * within the string. |
||
327 | * |
||
328 | * Since 2.10, this function allows to pass a negative @offset to |
||
329 | * step backwards. It is usually worth stepping backwards from the end |
||
330 | * instead of forwards if @offset is in the last fourth of the string, |
||
331 | * since moving forward is about 3 times faster than moving backward. |
||
332 | * |
||
333 | * Note that this function doesn't abort when reaching the end of @str. |
||
334 | * Therefore you should be sure that @offset is within string boundaries |
||
335 | * before calling that function. Call g_utf8_strlen() when unsure. |
||
336 | * This limitation exists as this function is called frequently during |
||
337 | * text rendering and therefore has to be as fast as possible. |
||
338 | * |
||
339 | * Returns: the resulting pointer |
||
340 | */ |
||
341 | gchar * |
||
342 | g_utf8_offset_to_pointer (const gchar *str, |
||
343 | glong offset) |
||
344 | { |
||
345 | const gchar *s = str; |
||
346 | |||
347 | if (offset > 0) |
||
348 | while (offset--) |
||
349 | s = g_utf8_next_char (s); |
||
350 | else |
||
351 | { |
||
352 | const char *s1; |
||
353 | |||
354 | /* This nice technique for fast backwards stepping |
||
355 | * through a UTF-8 string was dubbed "stutter stepping" |
||
356 | * by its inventor, Larry Ewing. |
||
357 | */ |
||
358 | while (offset) |
||
359 | { |
||
360 | s1 = s; |
||
361 | s += offset; |
||
362 | while ((*s & 0xc0) == 0x80) |
||
363 | s--; |
||
364 | |||
365 | offset += g_utf8_pointer_to_offset (s, s1); |
||
366 | } |
||
367 | } |
||
368 | |||
369 | return (gchar *)s; |
||
370 | } |
||
371 | |||
372 | /** |
||
373 | * g_utf8_pointer_to_offset: |
||
374 | * @str: a UTF-8 encoded string |
||
375 | * @pos: a pointer to a position within @str |
||
376 | * |
||
377 | * Converts from a pointer to position within a string to a integer |
||
378 | * character offset. |
||
379 | * |
||
380 | * Since 2.10, this function allows @pos to be before @str, and returns |
||
381 | * a negative offset in this case. |
||
382 | * |
||
383 | * Returns: the resulting character offset |
||
384 | */ |
||
385 | glong |
||
386 | g_utf8_pointer_to_offset (const gchar *str, |
||
387 | const gchar *pos) |
||
388 | { |
||
389 | const gchar *s = str; |
||
390 | glong offset = 0; |
||
391 | |||
392 | if (pos < str) |
||
393 | offset = - g_utf8_pointer_to_offset (pos, str); |
||
394 | else |
||
395 | while (s < pos) |
||
396 | { |
||
397 | s = g_utf8_next_char (s); |
||
398 | offset++; |
||
399 | } |
||
400 | |||
401 | return offset; |
||
402 | } |
||
403 | |||
404 | |||
405 | /** |
||
406 | * g_utf8_strncpy: |
||
407 | * @dest: buffer to fill with characters from @src |
||
408 | * @src: UTF-8 encoded string |
||
409 | * @n: character count |
||
410 | * |
||
411 | * Like the standard C strncpy() function, but copies a given number |
||
412 | * of characters instead of a given number of bytes. The @src string |
||
413 | * must be valid UTF-8 encoded text. (Use g_utf8_validate() on all |
||
414 | * text before trying to use UTF-8 utility functions with it.) |
||
415 | * |
||
416 | * Returns: @dest |
||
417 | */ |
||
418 | gchar * |
||
419 | g_utf8_strncpy (gchar *dest, |
||
420 | const gchar *src, |
||
421 | gsize n) |
||
422 | { |
||
423 | const gchar *s = src; |
||
424 | while (n && *s) |
||
425 | { |
||
426 | s = g_utf8_next_char(s); |
||
427 | n--; |
||
428 | } |
||
429 | strncpy(dest, src, s - src); |
||
430 | dest[s - src] = 0; |
||
431 | return dest; |
||
432 | } |
||
433 | |||
434 | /* unicode_strchr */ |
||
435 | |||
436 | /** |
||
437 | * g_unichar_to_utf8: |
||
438 | * @c: a Unicode character code |
||
439 | * @outbuf: (out caller-allocates) (optional): output buffer, must have at |
||
440 | * least 6 bytes of space. If %NULL, the length will be computed and |
||
441 | * returned and nothing will be written to @outbuf. |
||
442 | * |
||
443 | * Converts a single character to UTF-8. |
||
444 | * |
||
445 | * Returns: number of bytes written |
||
446 | */ |
||
447 | int |
||
448 | g_unichar_to_utf8 (gunichar c, |
||
449 | gchar *outbuf) |
||
450 | { |
||
451 | /* If this gets modified, also update the copy in g_string_insert_unichar() */ |
||
452 | guint len = 0; |
||
453 | int first; |
||
454 | int i; |
||
455 | |||
456 | if (c < 0x80) |
||
457 | { |
||
458 | first = 0; |
||
459 | len = 1; |
||
460 | } |
||
461 | else if (c < 0x800) |
||
462 | { |
||
463 | first = 0xc0; |
||
464 | len = 2; |
||
465 | } |
||
466 | else if (c < 0x10000) |
||
467 | { |
||
468 | first = 0xe0; |
||
469 | len = 3; |
||
470 | } |
||
471 | else if (c < 0x200000) |
||
472 | { |
||
473 | first = 0xf0; |
||
474 | len = 4; |
||
475 | } |
||
476 | else if (c < 0x4000000) |
||
477 | { |
||
478 | first = 0xf8; |
||
479 | len = 5; |
||
480 | } |
||
481 | else |
||
482 | { |
||
483 | first = 0xfc; |
||
484 | len = 6; |
||
485 | } |
||
486 | |||
487 | if (outbuf) |
||
488 | { |
||
489 | for (i = len - 1; i > 0; --i) |
||
490 | { |
||
491 | outbuf[i] = (c & 0x3f) | 0x80; |
||
492 | c >>= 6; |
||
493 | } |
||
494 | outbuf[0] = c | first; |
||
495 | } |
||
496 | |||
497 | return len; |
||
498 | } |
||
499 | |||
500 | /** |
||
501 | * g_utf8_strchr: |
||
502 | * @p: a nul-terminated UTF-8 encoded string |
||
503 | * @len: the maximum length of @p |
||
504 | * @c: a Unicode character |
||
505 | * |
||
506 | * Finds the leftmost occurrence of the given Unicode character |
||
507 | * in a UTF-8 encoded string, while limiting the search to @len bytes. |
||
508 | * If @len is -1, allow unbounded search. |
||
509 | * |
||
510 | * Returns: %NULL if the string does not contain the character, |
||
511 | * otherwise, a pointer to the start of the leftmost occurrence |
||
512 | * of the character in the string. |
||
513 | */ |
||
514 | gchar * |
||
515 | g_utf8_strchr (const char *p, |
||
516 | gssize len, |
||
517 | gunichar c) |
||
518 | { |
||
519 | gchar ch[10]; |
||
520 | |||
521 | gint charlen = g_unichar_to_utf8 (c, ch); |
||
522 | ch[charlen] = '\0'; |
||
523 | |||
524 | return g_strstr_len (p, len, ch); |
||
525 | } |
||
526 | |||
527 | |||
528 | /** |
||
529 | * g_utf8_strrchr: |
||
530 | * @p: a nul-terminated UTF-8 encoded string |
||
531 | * @len: the maximum length of @p |
||
532 | * @c: a Unicode character |
||
533 | * |
||
534 | * Find the rightmost occurrence of the given Unicode character |
||
535 | * in a UTF-8 encoded string, while limiting the search to @len bytes. |
||
536 | * If @len is -1, allow unbounded search. |
||
537 | * |
||
538 | * Returns: %NULL if the string does not contain the character, |
||
539 | * otherwise, a pointer to the start of the rightmost occurrence |
||
540 | * of the character in the string. |
||
541 | */ |
||
542 | gchar * |
||
543 | g_utf8_strrchr (const char *p, |
||
544 | gssize len, |
||
545 | gunichar c) |
||
546 | { |
||
547 | gchar ch[10]; |
||
548 | |||
549 | gint charlen = g_unichar_to_utf8 (c, ch); |
||
550 | ch[charlen] = '\0'; |
||
551 | |||
552 | return g_strrstr_len (p, len, ch); |
||
553 | } |
||
554 | |||
555 | |||
556 | /* Like g_utf8_get_char, but take a maximum length |
||
557 | * and return (gunichar)-2 on incomplete trailing character; |
||
558 | * also check for malformed or overlong sequences |
||
559 | * and return (gunichar)-1 in this case. |
||
560 | */ |
||
561 | static inline gunichar |
||
562 | g_utf8_get_char_extended (const gchar *p, |
||
563 | gssize max_len) |
||
564 | { |
||
565 | guint i, len; |
||
566 | gunichar min_code; |
||
567 | gunichar wc = (guchar) *p; |
||
568 | |||
569 | if (wc < 0x80) |
||
570 | { |
||
571 | return wc; |
||
572 | } |
||
573 | else if (G_UNLIKELY (wc < 0xc0)) |
||
574 | { |
||
575 | return (gunichar)-1; |
||
576 | } |
||
577 | else if (wc < 0xe0) |
||
578 | { |
||
579 | len = 2; |
||
580 | wc &= 0x1f; |
||
581 | min_code = 1 << 7; |
||
582 | } |
||
583 | else if (wc < 0xf0) |
||
584 | { |
||
585 | len = 3; |
||
586 | wc &= 0x0f; |
||
587 | min_code = 1 << 11; |
||
588 | } |
||
589 | else if (wc < 0xf8) |
||
590 | { |
||
591 | len = 4; |
||
592 | wc &= 0x07; |
||
593 | min_code = 1 << 16; |
||
594 | } |
||
595 | else if (wc < 0xfc) |
||
596 | { |
||
597 | len = 5; |
||
598 | wc &= 0x03; |
||
599 | min_code = 1 << 21; |
||
600 | } |
||
601 | else if (wc < 0xfe) |
||
602 | { |
||
603 | len = 6; |
||
604 | wc &= 0x01; |
||
605 | min_code = 1 << 26; |
||
606 | } |
||
607 | else |
||
608 | { |
||
609 | return (gunichar)-1; |
||
610 | } |
||
611 | |||
612 | if (G_UNLIKELY (max_len >= 0 && len > max_len)) |
||
613 | { |
||
614 | for (i = 1; i < max_len; i++) |
||
615 | { |
||
616 | if ((((guchar *)p)[i] & 0xc0) != 0x80) |
||
617 | return (gunichar)-1; |
||
618 | } |
||
619 | return (gunichar)-2; |
||
620 | } |
||
621 | |||
622 | for (i = 1; i < len; ++i) |
||
623 | { |
||
624 | gunichar ch = ((guchar *)p)[i]; |
||
625 | |||
626 | if (G_UNLIKELY ((ch & 0xc0) != 0x80)) |
||
627 | { |
||
628 | if (ch) |
||
629 | return (gunichar)-1; |
||
630 | else |
||
631 | return (gunichar)-2; |
||
632 | } |
||
633 | |||
634 | wc <<= 6; |
||
635 | wc |= (ch & 0x3f); |
||
636 | } |
||
637 | |||
638 | if (G_UNLIKELY (wc < min_code)) |
||
639 | return (gunichar)-1; |
||
640 | |||
641 | return wc; |
||
642 | } |
||
643 | |||
644 | /** |
||
645 | * g_utf8_get_char_validated: |
||
646 | * @p: a pointer to Unicode character encoded as UTF-8 |
||
647 | * @max_len: the maximum number of bytes to read, or -1, for no maximum or |
||
648 | * if @p is nul-terminated |
||
649 | * |
||
650 | * Convert a sequence of bytes encoded as UTF-8 to a Unicode character. |
||
651 | * This function checks for incomplete characters, for invalid characters |
||
652 | * such as characters that are out of the range of Unicode, and for |
||
653 | * overlong encodings of valid characters. |
||
654 | * |
||
655 | * Returns: the resulting character. If @p points to a partial |
||
656 | * sequence at the end of a string that could begin a valid |
||
657 | * character (or if @max_len is zero), returns (gunichar)-2; |
||
658 | * otherwise, if @p does not point to a valid UTF-8 encoded |
||
659 | * Unicode character, returns (gunichar)-1. |
||
660 | */ |
||
661 | gunichar |
||
662 | g_utf8_get_char_validated (const gchar *p, |
||
663 | gssize max_len) |
||
664 | { |
||
665 | gunichar result; |
||
666 | |||
667 | if (max_len == 0) |
||
668 | return (gunichar)-2; |
||
669 | |||
670 | result = g_utf8_get_char_extended (p, max_len); |
||
671 | |||
672 | if (result & 0x80000000) |
||
673 | return result; |
||
674 | else if (!UNICODE_VALID (result)) |
||
675 | return (gunichar)-1; |
||
676 | else |
||
677 | return result; |
||
678 | } |
||
679 | |||
/* Yields the low six bits of the byte at p and post-increments p.
 * p is expanded without parentheses and is modified, so pass a plain
 * pointer variable.  No check is made that the byte is a continuation
 * byte.
 */
#define CONT_BYTE_FAST(p) ((guchar)*p++ & 0x3f)
||
681 | |||
682 | /** |
||
683 | * g_utf8_to_ucs4_fast: |
||
684 | * @str: a UTF-8 encoded string |
||
685 | * @len: the maximum length of @str to use, in bytes. If @len < 0, |
||
686 | * then the string is nul-terminated. |
||
687 | * @items_written: (out caller-allocates) (optional): location to store the |
||
688 | * number of characters in the result, or %NULL. |
||
689 | * |
||
690 | * Convert a string from UTF-8 to a 32-bit fixed width |
||
691 | * representation as UCS-4, assuming valid UTF-8 input. |
||
692 | * This function is roughly twice as fast as g_utf8_to_ucs4() |
||
693 | * but does no error checking on the input. A trailing 0 character |
||
694 | * will be added to the string after the converted text. |
||
695 | * |
||
696 | * Returns: a pointer to a newly allocated UCS-4 string. |
||
697 | * This value must be freed with g_free(). |
||
698 | */ |
||
699 | gunichar * |
||
700 | g_utf8_to_ucs4_fast (const gchar *str, |
||
701 | glong len, |
||
702 | glong *items_written) |
||
703 | { |
||
704 | gunichar *result; |
||
705 | gint n_chars, i; |
||
706 | const gchar *p; |
||
707 | |||
708 | g_return_val_if_fail (str != NULL, NULL); |
||
709 | |||
710 | p = str; |
||
711 | n_chars = 0; |
||
712 | if (len < 0) |
||
713 | { |
||
714 | while (*p) |
||
715 | { |
||
716 | p = g_utf8_next_char (p); |
||
717 | ++n_chars; |
||
718 | } |
||
719 | } |
||
720 | else |
||
721 | { |
||
722 | while (p < str + len && *p) |
||
723 | { |
||
724 | p = g_utf8_next_char (p); |
||
725 | ++n_chars; |
||
726 | } |
||
727 | } |
||
728 | |||
729 | result = g_new (gunichar, n_chars + 1); |
||
730 | |||
731 | p = str; |
||
732 | for (i=0; i < n_chars; i++) |
||
733 | { |
||
734 | guchar first = (guchar)*p++; |
||
735 | gunichar wc; |
||
736 | |||
737 | if (first < 0xc0) |
||
738 | { |
||
739 | /* We really hope first < 0x80, but we don't want to test an |
||
740 | * extra branch for invalid input, which this function |
||
741 | * does not care about. Handling unexpected continuation bytes |
||
742 | * here will do the least damage. */ |
||
743 | wc = first; |
||
744 | } |
||
745 | else |
||
746 | { |
||
747 | gunichar c1 = CONT_BYTE_FAST(p); |
||
748 | if (first < 0xe0) |
||
749 | { |
||
750 | wc = ((first & 0x1f) << 6) | c1; |
||
751 | } |
||
752 | else |
||
753 | { |
||
754 | gunichar c2 = CONT_BYTE_FAST(p); |
||
755 | if (first < 0xf0) |
||
756 | { |
||
757 | wc = ((first & 0x0f) << 12) | (c1 << 6) | c2; |
||
758 | } |
||
759 | else |
||
760 | { |
||
761 | gunichar c3 = CONT_BYTE_FAST(p); |
||
762 | wc = ((first & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3; |
||
763 | if (G_UNLIKELY (first >= 0xf8)) |
||
764 | { |
||
765 | /* This can't be valid UTF-8, but g_utf8_next_char() |
||
766 | * and company allow out-of-range sequences */ |
||
767 | gunichar mask = 1 << 20; |
||
768 | while ((wc & mask) != 0) |
||
769 | { |
||
770 | wc <<= 6; |
||
771 | wc |= CONT_BYTE_FAST(p); |
||
772 | mask <<= 5; |
||
773 | } |
||
774 | wc &= mask - 1; |
||
775 | } |
||
776 | } |
||
777 | } |
||
778 | } |
||
779 | result[i] = wc; |
||
780 | } |
||
781 | result[i] = 0; |
||
782 | |||
783 | if (items_written) |
||
784 | *items_written = i; |
||
785 | |||
786 | return result; |
||
787 | } |
||
788 | |||
789 | static gpointer |
||
790 | try_malloc_n (gsize n_blocks, gsize n_block_bytes, GError **error) |
||
791 | { |
||
792 | gpointer ptr = g_try_malloc_n (n_blocks, n_block_bytes); |
||
793 | if (ptr == NULL) |
||
794 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, |
||
795 | _("Failed to allocate memory")); |
||
796 | return ptr; |
||
797 | } |
||
798 | |||
799 | /** |
||
800 | * g_utf8_to_ucs4: |
||
801 | * @str: a UTF-8 encoded string |
||
802 | * @len: the maximum length of @str to use, in bytes. If @len < 0, |
||
803 | * then the string is nul-terminated. |
||
804 | * @items_read: (out caller-allocates) (optional): location to store number of |
||
805 | * bytes read, or %NULL. |
||
806 | * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be |
||
807 | * returned in case @str contains a trailing partial |
||
808 | * character. If an error occurs then the index of the |
||
809 | * invalid input is stored here. |
||
810 | * @items_written: (out caller-allocates) (optional): location to store number |
||
811 | * of characters written or %NULL. The value here stored does not include |
||
812 | * the trailing 0 character. |
||
813 | * @error: location to store the error occurring, or %NULL to ignore |
||
814 | * errors. Any of the errors in #GConvertError other than |
||
815 | * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
||
816 | * |
||
817 | * Convert a string from UTF-8 to a 32-bit fixed width |
||
818 | * representation as UCS-4. A trailing 0 character will be added to the |
||
819 | * string after the converted text. |
||
820 | * |
||
821 | * Returns: a pointer to a newly allocated UCS-4 string. |
||
822 | * This value must be freed with g_free(). If an error occurs, |
||
823 | * %NULL will be returned and @error set. |
||
824 | */ |
||
825 | gunichar * |
||
826 | g_utf8_to_ucs4 (const gchar *str, |
||
827 | glong len, |
||
828 | glong *items_read, |
||
829 | glong *items_written, |
||
830 | GError **error) |
||
831 | { |
||
832 | gunichar *result = NULL; |
||
833 | gint n_chars, i; |
||
834 | const gchar *in; |
||
835 | |||
836 | in = str; |
||
837 | n_chars = 0; |
||
838 | while ((len < 0 || str + len - in > 0) && *in) |
||
839 | { |
||
840 | gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in); |
||
841 | if (wc & 0x80000000) |
||
842 | { |
||
843 | if (wc == (gunichar)-2) |
||
844 | { |
||
845 | if (items_read) |
||
846 | break; |
||
847 | else |
||
848 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
||
849 | _("Partial character sequence at end of input")); |
||
850 | } |
||
851 | else |
||
852 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
853 | _("Invalid byte sequence in conversion input")); |
||
854 | |||
855 | goto err_out; |
||
856 | } |
||
857 | |||
858 | n_chars++; |
||
859 | |||
860 | in = g_utf8_next_char (in); |
||
861 | } |
||
862 | |||
863 | result = try_malloc_n (n_chars + 1, sizeof (gunichar), error); |
||
864 | if (result == NULL) |
||
865 | goto err_out; |
||
866 | |||
867 | in = str; |
||
868 | for (i=0; i < n_chars; i++) |
||
869 | { |
||
870 | result[i] = g_utf8_get_char (in); |
||
871 | in = g_utf8_next_char (in); |
||
872 | } |
||
873 | result[i] = 0; |
||
874 | |||
875 | if (items_written) |
||
876 | *items_written = n_chars; |
||
877 | |||
878 | err_out: |
||
879 | if (items_read) |
||
880 | *items_read = in - str; |
||
881 | |||
882 | return result; |
||
883 | } |
||
884 | |||
885 | /** |
||
886 | * g_ucs4_to_utf8: |
||
887 | * @str: a UCS-4 encoded string |
||
888 | * @len: the maximum length (number of characters) of @str to use. |
||
889 | * If @len < 0, then the string is nul-terminated. |
||
890 | * @items_read: (out caller-allocates) (optional): location to store number of |
||
891 | * characters read, or %NULL. |
||
892 | * @items_written: (out caller-allocates) (optional): location to store number |
||
893 | * of bytes written or %NULL. The value here stored does not include the |
||
894 | * trailing 0 byte. |
||
895 | * @error: location to store the error occurring, or %NULL to ignore |
||
896 | * errors. Any of the errors in #GConvertError other than |
||
897 | * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
||
898 | * |
||
899 | * Convert a string from a 32-bit fixed width representation as UCS-4. |
||
900 | * to UTF-8. The result will be terminated with a 0 byte. |
||
901 | * |
||
902 | * Returns: a pointer to a newly allocated UTF-8 string. |
||
903 | * This value must be freed with g_free(). If an error occurs, |
||
904 | * %NULL will be returned and @error set. In that case, @items_read |
||
905 | * will be set to the position of the first invalid input character. |
||
906 | */ |
||
907 | gchar * |
||
908 | g_ucs4_to_utf8 (const gunichar *str, |
||
909 | glong len, |
||
910 | glong *items_read, |
||
911 | glong *items_written, |
||
912 | GError **error) |
||
913 | { |
||
914 | gint result_length; |
||
915 | gchar *result = NULL; |
||
916 | gchar *p; |
||
917 | gint i; |
||
918 | |||
919 | result_length = 0; |
||
920 | for (i = 0; len < 0 || i < len ; i++) |
||
921 | { |
||
922 | if (!str[i]) |
||
923 | break; |
||
924 | |||
925 | if (str[i] >= 0x80000000) |
||
926 | { |
||
927 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
928 | _("Character out of range for UTF-8")); |
||
929 | goto err_out; |
||
930 | } |
||
931 | |||
932 | result_length += UTF8_LENGTH (str[i]); |
||
933 | } |
||
934 | |||
935 | result = try_malloc_n (result_length + 1, 1, error); |
||
936 | if (result == NULL) |
||
937 | goto err_out; |
||
938 | |||
939 | p = result; |
||
940 | |||
941 | i = 0; |
||
942 | while (p < result + result_length) |
||
943 | p += g_unichar_to_utf8 (str[i++], p); |
||
944 | |||
945 | *p = '\0'; |
||
946 | |||
947 | if (items_written) |
||
948 | *items_written = p - result; |
||
949 | |||
950 | err_out: |
||
951 | if (items_read) |
||
952 | *items_read = i; |
||
953 | |||
954 | return result; |
||
955 | } |
||
956 | |||
/* Computes (h - 0xd800) * 0x400 + (l - 0xdc00) + 0x10000, i.e. the code
 * point for a UTF-16 surrogate pair with h as the high and l as the low
 * surrogate.  No range check is performed on either argument.
 */
#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
||
958 | |||
959 | /** |
||
960 | * g_utf16_to_utf8: |
||
961 | * @str: a UTF-16 encoded string |
||
962 | * @len: the maximum length (number of #gunichar2) of @str to use. |
||
963 | * If @len < 0, then the string is nul-terminated. |
||
964 | * @items_read: (out caller-allocates) (optional): location to store number of |
||
965 | * words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will |
||
966 | * be returned in case @str contains a trailing partial character. If |
||
967 | * an error occurs then the index of the invalid input is stored here. |
||
968 | * @items_written: (out caller-allocates) (optional): location to store number |
||
969 | * of bytes written, or %NULL. The value stored here does not include the |
||
970 | * trailing 0 byte. |
||
971 | * @error: location to store the error occurring, or %NULL to ignore |
||
972 | * errors. Any of the errors in #GConvertError other than |
||
973 | * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
||
974 | * |
||
975 | * Convert a string from UTF-16 to UTF-8. The result will be |
||
976 | * terminated with a 0 byte. |
||
977 | * |
||
978 | * Note that the input is expected to be already in native endianness, |
||
979 | * an initial byte-order-mark character is not handled specially. |
||
980 | * g_convert() can be used to convert a byte buffer of UTF-16 data of |
||
981 | * ambiguous endianess. |
||
982 | * |
||
983 | * Further note that this function does not validate the result |
||
984 | * string; it may e.g. include embedded NUL characters. The only |
||
985 | * validation done by this function is to ensure that the input can |
||
986 | * be correctly interpreted as UTF-16, i.e. it doesn't contain |
||
987 | * things unpaired surrogates. |
||
988 | * |
||
989 | * Returns: a pointer to a newly allocated UTF-8 string. |
||
990 | * This value must be freed with g_free(). If an error occurs, |
||
991 | * %NULL will be returned and @error set. |
||
992 | **/ |
||
993 | gchar * |
||
994 | g_utf16_to_utf8 (const gunichar2 *str, |
||
995 | glong len, |
||
996 | glong *items_read, |
||
997 | glong *items_written, |
||
998 | GError **error) |
||
999 | { |
||
1000 | /* This function and g_utf16_to_ucs4 are almost exactly identical - |
||
1001 | * The lines that differ are marked. |
||
1002 | */ |
||
1003 | const gunichar2 *in; |
||
1004 | gchar *out; |
||
1005 | gchar *result = NULL; |
||
1006 | gint n_bytes; |
||
1007 | gunichar high_surrogate; |
||
1008 | |||
1009 | g_return_val_if_fail (str != NULL, NULL); |
||
1010 | |||
1011 | n_bytes = 0; |
||
1012 | in = str; |
||
1013 | high_surrogate = 0; |
||
1014 | while ((len < 0 || in - str < len) && *in) |
||
1015 | { |
||
1016 | gunichar2 c = *in; |
||
1017 | gunichar wc; |
||
1018 | |||
1019 | if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ |
||
1020 | { |
||
1021 | if (high_surrogate) |
||
1022 | { |
||
1023 | wc = SURROGATE_VALUE (high_surrogate, c); |
||
1024 | high_surrogate = 0; |
||
1025 | } |
||
1026 | else |
||
1027 | { |
||
1028 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
1029 | _("Invalid sequence in conversion input")); |
||
1030 | goto err_out; |
||
1031 | } |
||
1032 | } |
||
1033 | else |
||
1034 | { |
||
1035 | if (high_surrogate) |
||
1036 | { |
||
1037 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
1038 | _("Invalid sequence in conversion input")); |
||
1039 | goto err_out; |
||
1040 | } |
||
1041 | |||
1042 | if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ |
||
1043 | { |
||
1044 | high_surrogate = c; |
||
1045 | goto next1; |
||
1046 | } |
||
1047 | else |
||
1048 | wc = c; |
||
1049 | } |
||
1050 | |||
1051 | /********** DIFFERENT for UTF8/UCS4 **********/ |
||
1052 | n_bytes += UTF8_LENGTH (wc); |
||
1053 | |||
1054 | next1: |
||
1055 | in++; |
||
1056 | } |
||
1057 | |||
1058 | if (high_surrogate && !items_read) |
||
1059 | { |
||
1060 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
||
1061 | _("Partial character sequence at end of input")); |
||
1062 | goto err_out; |
||
1063 | } |
||
1064 | |||
1065 | /* At this point, everything is valid, and we just need to convert |
||
1066 | */ |
||
1067 | /********** DIFFERENT for UTF8/UCS4 **********/ |
||
1068 | result = try_malloc_n (n_bytes + 1, 1, error); |
||
1069 | if (result == NULL) |
||
1070 | goto err_out; |
||
1071 | |||
1072 | high_surrogate = 0; |
||
1073 | out = result; |
||
1074 | in = str; |
||
1075 | while (out < result + n_bytes) |
||
1076 | { |
||
1077 | gunichar2 c = *in; |
||
1078 | gunichar wc; |
||
1079 | |||
1080 | if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ |
||
1081 | { |
||
1082 | wc = SURROGATE_VALUE (high_surrogate, c); |
||
1083 | high_surrogate = 0; |
||
1084 | } |
||
1085 | else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ |
||
1086 | { |
||
1087 | high_surrogate = c; |
||
1088 | goto next2; |
||
1089 | } |
||
1090 | else |
||
1091 | wc = c; |
||
1092 | |||
1093 | /********** DIFFERENT for UTF8/UCS4 **********/ |
||
1094 | out += g_unichar_to_utf8 (wc, out); |
||
1095 | |||
1096 | next2: |
||
1097 | in++; |
||
1098 | } |
||
1099 | |||
1100 | /********** DIFFERENT for UTF8/UCS4 **********/ |
||
1101 | *out = '\0'; |
||
1102 | |||
1103 | if (items_written) |
||
1104 | /********** DIFFERENT for UTF8/UCS4 **********/ |
||
1105 | *items_written = out - result; |
||
1106 | |||
1107 | err_out: |
||
1108 | if (items_read) |
||
1109 | *items_read = in - str; |
||
1110 | |||
1111 | return result; |
||
1112 | } |
||
1113 | |||
1114 | /** |
||
1115 | * g_utf16_to_ucs4: |
||
1116 | * @str: a UTF-16 encoded string |
||
1117 | * @len: the maximum length (number of #gunichar2) of @str to use. |
||
1118 | * If @len < 0, then the string is nul-terminated. |
||
1119 | * @items_read: (out caller-allocates) (optional): location to store number of |
||
1120 | * words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will |
||
1121 | * be returned in case @str contains a trailing partial character. If |
||
1122 | * an error occurs then the index of the invalid input is stored here. |
||
1123 | * @items_written: (out caller-allocates) (optional): location to store number |
||
1124 | * of characters written, or %NULL. The value stored here does not include |
||
1125 | * the trailing 0 character. |
||
1126 | * @error: location to store the error occurring, or %NULL to ignore |
||
1127 | * errors. Any of the errors in #GConvertError other than |
||
1128 | * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
||
1129 | * |
||
1130 | * Convert a string from UTF-16 to UCS-4. The result will be |
||
1131 | * nul-terminated. |
||
1132 | * |
||
1133 | * Returns: a pointer to a newly allocated UCS-4 string. |
||
1134 | * This value must be freed with g_free(). If an error occurs, |
||
1135 | * %NULL will be returned and @error set. |
||
1136 | */ |
||
1137 | gunichar * |
||
1138 | g_utf16_to_ucs4 (const gunichar2 *str, |
||
1139 | glong len, |
||
1140 | glong *items_read, |
||
1141 | glong *items_written, |
||
1142 | GError **error) |
||
1143 | { |
||
1144 | const gunichar2 *in; |
||
1145 | gchar *out; |
||
1146 | gchar *result = NULL; |
||
1147 | gint n_bytes; |
||
1148 | gunichar high_surrogate; |
||
1149 | |||
1150 | g_return_val_if_fail (str != NULL, NULL); |
||
1151 | |||
1152 | n_bytes = 0; |
||
1153 | in = str; |
||
1154 | high_surrogate = 0; |
||
1155 | while ((len < 0 || in - str < len) && *in) |
||
1156 | { |
||
1157 | gunichar2 c = *in; |
||
1158 | |||
1159 | if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ |
||
1160 | { |
||
1161 | if (high_surrogate) |
||
1162 | { |
||
1163 | high_surrogate = 0; |
||
1164 | } |
||
1165 | else |
||
1166 | { |
||
1167 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
1168 | _("Invalid sequence in conversion input")); |
||
1169 | goto err_out; |
||
1170 | } |
||
1171 | } |
||
1172 | else |
||
1173 | { |
||
1174 | if (high_surrogate) |
||
1175 | { |
||
1176 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
1177 | _("Invalid sequence in conversion input")); |
||
1178 | goto err_out; |
||
1179 | } |
||
1180 | |||
1181 | if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ |
||
1182 | { |
||
1183 | high_surrogate = c; |
||
1184 | goto next1; |
||
1185 | } |
||
1186 | } |
||
1187 | |||
1188 | /********** DIFFERENT for UTF8/UCS4 **********/ |
||
1189 | n_bytes += sizeof (gunichar); |
||
1190 | |||
1191 | next1: |
||
1192 | in++; |
||
1193 | } |
||
1194 | |||
1195 | if (high_surrogate && !items_read) |
||
1196 | { |
||
1197 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
||
1198 | _("Partial character sequence at end of input")); |
||
1199 | goto err_out; |
||
1200 | } |
||
1201 | |||
1202 | /* At this point, everything is valid, and we just need to convert |
||
1203 | */ |
||
1204 | /********** DIFFERENT for UTF8/UCS4 **********/ |
||
1205 | result = try_malloc_n (n_bytes + 4, 1, error); |
||
1206 | if (result == NULL) |
||
1207 | goto err_out; |
||
1208 | |||
1209 | high_surrogate = 0; |
||
1210 | out = result; |
||
1211 | in = str; |
||
1212 | while (out < result + n_bytes) |
||
1213 | { |
||
1214 | gunichar2 c = *in; |
||
1215 | gunichar wc; |
||
1216 | |||
1217 | if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ |
||
1218 | { |
||
1219 | wc = SURROGATE_VALUE (high_surrogate, c); |
||
1220 | high_surrogate = 0; |
||
1221 | } |
||
1222 | else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ |
||
1223 | { |
||
1224 | high_surrogate = c; |
||
1225 | goto next2; |
||
1226 | } |
||
1227 | else |
||
1228 | wc = c; |
||
1229 | |||
1230 | /********** DIFFERENT for UTF8/UCS4 **********/ |
||
1231 | *(gunichar *)out = wc; |
||
1232 | out += sizeof (gunichar); |
||
1233 | |||
1234 | next2: |
||
1235 | in++; |
||
1236 | } |
||
1237 | |||
1238 | /********** DIFFERENT for UTF8/UCS4 **********/ |
||
1239 | *(gunichar *)out = 0; |
||
1240 | |||
1241 | if (items_written) |
||
1242 | /********** DIFFERENT for UTF8/UCS4 **********/ |
||
1243 | *items_written = (out - result) / sizeof (gunichar); |
||
1244 | |||
1245 | err_out: |
||
1246 | if (items_read) |
||
1247 | *items_read = in - str; |
||
1248 | |||
1249 | return (gunichar *)result; |
||
1250 | } |
||
1251 | |||
1252 | /** |
||
1253 | * g_utf8_to_utf16: |
||
1254 | * @str: a UTF-8 encoded string |
||
1255 | * @len: the maximum length (number of bytes) of @str to use. |
||
1256 | * If @len < 0, then the string is nul-terminated. |
||
1257 | * @items_read: (out caller-allocates) (optional): location to store number of |
||
1258 | * bytes read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will |
||
1259 | * be returned in case @str contains a trailing partial character. If |
||
1260 | * an error occurs then the index of the invalid input is stored here. |
||
1261 | * @items_written: (out caller-allocates) (optional): location to store number |
||
1262 | * of #gunichar2 written, or %NULL. The value stored here does not include |
||
1263 | * the trailing 0. |
||
1264 | * @error: location to store the error occurring, or %NULL to ignore |
||
1265 | * errors. Any of the errors in #GConvertError other than |
||
1266 | * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
||
1267 | * |
||
1268 | * Convert a string from UTF-8 to UTF-16. A 0 character will be |
||
1269 | * added to the result after the converted text. |
||
1270 | * |
||
1271 | * Returns: a pointer to a newly allocated UTF-16 string. |
||
1272 | * This value must be freed with g_free(). If an error occurs, |
||
1273 | * %NULL will be returned and @error set. |
||
1274 | */ |
||
1275 | gunichar2 * |
||
1276 | g_utf8_to_utf16 (const gchar *str, |
||
1277 | glong len, |
||
1278 | glong *items_read, |
||
1279 | glong *items_written, |
||
1280 | GError **error) |
||
1281 | { |
||
1282 | gunichar2 *result = NULL; |
||
1283 | gint n16; |
||
1284 | const gchar *in; |
||
1285 | gint i; |
||
1286 | |||
1287 | g_return_val_if_fail (str != NULL, NULL); |
||
1288 | |||
1289 | in = str; |
||
1290 | n16 = 0; |
||
1291 | while ((len < 0 || str + len - in > 0) && *in) |
||
1292 | { |
||
1293 | gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in); |
||
1294 | if (wc & 0x80000000) |
||
1295 | { |
||
1296 | if (wc == (gunichar)-2) |
||
1297 | { |
||
1298 | if (items_read) |
||
1299 | break; |
||
1300 | else |
||
1301 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
||
1302 | _("Partial character sequence at end of input")); |
||
1303 | } |
||
1304 | else |
||
1305 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
1306 | _("Invalid byte sequence in conversion input")); |
||
1307 | |||
1308 | goto err_out; |
||
1309 | } |
||
1310 | |||
1311 | if (wc < 0xd800) |
||
1312 | n16 += 1; |
||
1313 | else if (wc < 0xe000) |
||
1314 | { |
||
1315 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
1316 | _("Invalid sequence in conversion input")); |
||
1317 | |||
1318 | goto err_out; |
||
1319 | } |
||
1320 | else if (wc < 0x10000) |
||
1321 | n16 += 1; |
||
1322 | else if (wc < 0x110000) |
||
1323 | n16 += 2; |
||
1324 | else |
||
1325 | { |
||
1326 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
1327 | _("Character out of range for UTF-16")); |
||
1328 | |||
1329 | goto err_out; |
||
1330 | } |
||
1331 | |||
1332 | in = g_utf8_next_char (in); |
||
1333 | } |
||
1334 | |||
1335 | result = try_malloc_n (n16 + 1, sizeof (gunichar2), error); |
||
1336 | if (result == NULL) |
||
1337 | goto err_out; |
||
1338 | |||
1339 | in = str; |
||
1340 | for (i = 0; i < n16;) |
||
1341 | { |
||
1342 | gunichar wc = g_utf8_get_char (in); |
||
1343 | |||
1344 | if (wc < 0x10000) |
||
1345 | { |
||
1346 | result[i++] = wc; |
||
1347 | } |
||
1348 | else |
||
1349 | { |
||
1350 | result[i++] = (wc - 0x10000) / 0x400 + 0xd800; |
||
1351 | result[i++] = (wc - 0x10000) % 0x400 + 0xdc00; |
||
1352 | } |
||
1353 | |||
1354 | in = g_utf8_next_char (in); |
||
1355 | } |
||
1356 | |||
1357 | result[i] = 0; |
||
1358 | |||
1359 | if (items_written) |
||
1360 | *items_written = n16; |
||
1361 | |||
1362 | err_out: |
||
1363 | if (items_read) |
||
1364 | *items_read = in - str; |
||
1365 | |||
1366 | return result; |
||
1367 | } |
||
1368 | |||
1369 | /** |
||
1370 | * g_ucs4_to_utf16: |
||
1371 | * @str: a UCS-4 encoded string |
||
1372 | * @len: the maximum length (number of characters) of @str to use. |
||
1373 | * If @len < 0, then the string is nul-terminated. |
||
1374 | * @items_read: (out caller-allocates) (optional): location to store number of |
||
1375 | * bytes read, or %NULL. If an error occurs then the index of the invalid |
||
1376 | * input is stored here. |
||
1377 | * @items_written: (out caller-allocates) (optional): location to store number |
||
1378 | * of #gunichar2 written, or %NULL. The value stored here does not include |
||
1379 | * the trailing 0. |
||
1380 | * @error: location to store the error occurring, or %NULL to ignore |
||
1381 | * errors. Any of the errors in #GConvertError other than |
||
1382 | * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
||
1383 | * |
||
1384 | * Convert a string from UCS-4 to UTF-16. A 0 character will be |
||
1385 | * added to the result after the converted text. |
||
1386 | * |
||
1387 | * Returns: a pointer to a newly allocated UTF-16 string. |
||
1388 | * This value must be freed with g_free(). If an error occurs, |
||
1389 | * %NULL will be returned and @error set. |
||
1390 | */ |
||
1391 | gunichar2 * |
||
1392 | g_ucs4_to_utf16 (const gunichar *str, |
||
1393 | glong len, |
||
1394 | glong *items_read, |
||
1395 | glong *items_written, |
||
1396 | GError **error) |
||
1397 | { |
||
1398 | gunichar2 *result = NULL; |
||
1399 | gint n16; |
||
1400 | gint i, j; |
||
1401 | |||
1402 | n16 = 0; |
||
1403 | i = 0; |
||
1404 | while ((len < 0 || i < len) && str[i]) |
||
1405 | { |
||
1406 | gunichar wc = str[i]; |
||
1407 | |||
1408 | if (wc < 0xd800) |
||
1409 | n16 += 1; |
||
1410 | else if (wc < 0xe000) |
||
1411 | { |
||
1412 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
1413 | _("Invalid sequence in conversion input")); |
||
1414 | |||
1415 | goto err_out; |
||
1416 | } |
||
1417 | else if (wc < 0x10000) |
||
1418 | n16 += 1; |
||
1419 | else if (wc < 0x110000) |
||
1420 | n16 += 2; |
||
1421 | else |
||
1422 | { |
||
1423 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
1424 | _("Character out of range for UTF-16")); |
||
1425 | |||
1426 | goto err_out; |
||
1427 | } |
||
1428 | |||
1429 | i++; |
||
1430 | } |
||
1431 | |||
1432 | result = try_malloc_n (n16 + 1, sizeof (gunichar2), error); |
||
1433 | if (result == NULL) |
||
1434 | goto err_out; |
||
1435 | |||
1436 | for (i = 0, j = 0; j < n16; i++) |
||
1437 | { |
||
1438 | gunichar wc = str[i]; |
||
1439 | |||
1440 | if (wc < 0x10000) |
||
1441 | { |
||
1442 | result[j++] = wc; |
||
1443 | } |
||
1444 | else |
||
1445 | { |
||
1446 | result[j++] = (wc - 0x10000) / 0x400 + 0xd800; |
||
1447 | result[j++] = (wc - 0x10000) % 0x400 + 0xdc00; |
||
1448 | } |
||
1449 | } |
||
1450 | result[j] = 0; |
||
1451 | |||
1452 | if (items_written) |
||
1453 | *items_written = n16; |
||
1454 | |||
1455 | err_out: |
||
1456 | if (items_read) |
||
1457 | *items_read = i; |
||
1458 | |||
1459 | return result; |
||
1460 | } |
||
1461 | |||
/* Check the byte currently addressed by `p`: after masking with @mask it
 * must equal @expect, otherwise control jumps to the label `error`.
 * Both a byte pointer named `p` and a label named `error` must be in
 * scope wherever this macro is expanded.
 */
#define VALIDATE_BYTE(mask, expect)                                  \
  G_STMT_START {                                                     \
    if (G_UNLIKELY ((*(const guchar *) p & (mask)) != (expect)))     \
      goto error;                                                    \
  } G_STMT_END
||
1467 | |||
1468 | /* see IETF RFC 3629 Section 4 */ |
||
1469 | |||
1470 | static const gchar * |
||
1471 | fast_validate (const char *str) |
||
1472 | |||
1473 | { |
||
1474 | const gchar *p; |
||
1475 | |||
1476 | for (p = str; *p; p++) |
||
1477 | { |
||
1478 | if (*(guchar *)p < 128) |
||
1479 | /* done */; |
||
1480 | else |
||
1481 | { |
||
1482 | const gchar *last; |
||
1483 | |||
1484 | last = p; |
||
1485 | if (*(guchar *)p < 0xe0) /* 110xxxxx */ |
||
1486 | { |
||
1487 | if (G_UNLIKELY (*(guchar *)p < 0xc2)) |
||
1488 | goto error; |
||
1489 | } |
||
1490 | else |
||
1491 | { |
||
1492 | if (*(guchar *)p < 0xf0) /* 1110xxxx */ |
||
1493 | { |
||
1494 | switch (*(guchar *)p++ & 0x0f) |
||
1495 | { |
||
1496 | case 0: |
||
1497 | VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */ |
||
1498 | break; |
||
1499 | case 0x0d: |
||
1500 | VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */ |
||
1501 | break; |
||
1502 | default: |
||
1503 | VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ |
||
1504 | } |
||
1505 | } |
||
1506 | else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */ |
||
1507 | { |
||
1508 | switch (*(guchar *)p++ & 0x07) |
||
1509 | { |
||
1510 | case 0: |
||
1511 | VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ |
||
1512 | if (G_UNLIKELY((*(guchar *)p & 0x30) == 0)) |
||
1513 | goto error; |
||
1514 | break; |
||
1515 | case 4: |
||
1516 | VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */ |
||
1517 | break; |
||
1518 | default: |
||
1519 | VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ |
||
1520 | } |
||
1521 | p++; |
||
1522 | VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ |
||
1523 | } |
||
1524 | else |
||
1525 | goto error; |
||
1526 | } |
||
1527 | |||
1528 | p++; |
||
1529 | VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ |
||
1530 | |||
1531 | continue; |
||
1532 | |||
1533 | error: |
||
1534 | return last; |
||
1535 | } |
||
1536 | } |
||
1537 | |||
1538 | return p; |
||
1539 | } |
||
1540 | |||
1541 | static const gchar * |
||
1542 | fast_validate_len (const char *str, |
||
1543 | gssize max_len) |
||
1544 | |||
1545 | { |
||
1546 | const gchar *p; |
||
1547 | |||
1548 | g_assert (max_len >= 0); |
||
1549 | |||
1550 | for (p = str; ((p - str) < max_len) && *p; p++) |
||
1551 | { |
||
1552 | if (*(guchar *)p < 128) |
||
1553 | /* done */; |
||
1554 | else |
||
1555 | { |
||
1556 | const gchar *last; |
||
1557 | |||
1558 | last = p; |
||
1559 | if (*(guchar *)p < 0xe0) /* 110xxxxx */ |
||
1560 | { |
||
1561 | if (G_UNLIKELY (max_len - (p - str) < 2)) |
||
1562 | goto error; |
||
1563 | |||
1564 | if (G_UNLIKELY (*(guchar *)p < 0xc2)) |
||
1565 | goto error; |
||
1566 | } |
||
1567 | else |
||
1568 | { |
||
1569 | if (*(guchar *)p < 0xf0) /* 1110xxxx */ |
||
1570 | { |
||
1571 | if (G_UNLIKELY (max_len - (p - str) < 3)) |
||
1572 | goto error; |
||
1573 | |||
1574 | switch (*(guchar *)p++ & 0x0f) |
||
1575 | { |
||
1576 | case 0: |
||
1577 | VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */ |
||
1578 | break; |
||
1579 | case 0x0d: |
||
1580 | VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */ |
||
1581 | break; |
||
1582 | default: |
||
1583 | VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ |
||
1584 | } |
||
1585 | } |
||
1586 | else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */ |
||
1587 | { |
||
1588 | if (G_UNLIKELY (max_len - (p - str) < 4)) |
||
1589 | goto error; |
||
1590 | |||
1591 | switch (*(guchar *)p++ & 0x07) |
||
1592 | { |
||
1593 | case 0: |
||
1594 | VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ |
||
1595 | if (G_UNLIKELY((*(guchar *)p & 0x30) == 0)) |
||
1596 | goto error; |
||
1597 | break; |
||
1598 | case 4: |
||
1599 | VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */ |
||
1600 | break; |
||
1601 | default: |
||
1602 | VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ |
||
1603 | } |
||
1604 | p++; |
||
1605 | VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ |
||
1606 | } |
||
1607 | else |
||
1608 | goto error; |
||
1609 | } |
||
1610 | |||
1611 | p++; |
||
1612 | VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ |
||
1613 | |||
1614 | continue; |
||
1615 | |||
1616 | error: |
||
1617 | return last; |
||
1618 | } |
||
1619 | } |
||
1620 | |||
1621 | return p; |
||
1622 | } |
||
1623 | |||
1624 | /** |
||
1625 | * g_utf8_validate: |
||
1626 | * @str: (array length=max_len) (element-type guint8): a pointer to character data |
||
1627 | * @max_len: max bytes to validate, or -1 to go until NUL |
||
1628 | * @end: (allow-none) (out) (transfer none): return location for end of valid data |
||
1629 | * |
||
1630 | * Validates UTF-8 encoded text. @str is the text to validate; |
||
1631 | * if @str is nul-terminated, then @max_len can be -1, otherwise |
||
1632 | * @max_len should be the number of bytes to validate. |
||
1633 | * If @end is non-%NULL, then the end of the valid range |
||
1634 | * will be stored there (i.e. the start of the first invalid |
||
1635 | * character if some bytes were invalid, or the end of the text |
||
1636 | * being validated otherwise). |
||
1637 | * |
||
1638 | * Note that g_utf8_validate() returns %FALSE if @max_len is |
||
1639 | * positive and any of the @max_len bytes are nul. |
||
1640 | * |
||
1641 | * Returns %TRUE if all of @str was valid. Many GLib and GTK+ |
||
1642 | * routines require valid UTF-8 as input; so data read from a file |
||
1643 | * or the network should be checked with g_utf8_validate() before |
||
1644 | * doing anything else with it. |
||
1645 | * |
||
1646 | * Returns: %TRUE if the text was valid UTF-8 |
||
1647 | */ |
||
1648 | gboolean |
||
1649 | g_utf8_validate (const char *str, |
||
1650 | gssize max_len, |
||
1651 | const gchar **end) |
||
1652 | |||
1653 | { |
||
1654 | const gchar *p; |
||
1655 | |||
1656 | if (max_len < 0) |
||
1657 | p = fast_validate (str); |
||
1658 | else |
||
1659 | p = fast_validate_len (str, max_len); |
||
1660 | |||
1661 | if (end) |
||
1662 | *end = p; |
||
1663 | |||
1664 | if ((max_len >= 0 && p != str + max_len) || |
||
1665 | (max_len < 0 && *p != '\0')) |
||
1666 | return FALSE; |
||
1667 | else |
||
1668 | return TRUE; |
||
1669 | } |
||
1670 | |||
1671 | /** |
||
1672 | * g_unichar_validate: |
||
1673 | * @ch: a Unicode character |
||
1674 | * |
||
1675 | * Checks whether @ch is a valid Unicode character. Some possible |
||
1676 | * integer values of @ch will not be valid. 0 is considered a valid |
||
1677 | * character, though it's normally a string terminator. |
||
1678 | * |
||
1679 | * Returns: %TRUE if @ch is a valid Unicode character |
||
1680 | **/ |
||
1681 | gboolean |
||
1682 | g_unichar_validate (gunichar ch) |
||
1683 | { |
||
1684 | return UNICODE_VALID (ch); |
||
1685 | } |
||
1686 | |||
1687 | /** |
||
1688 | * g_utf8_strreverse: |
||
1689 | * @str: a UTF-8 encoded string |
||
1690 | * @len: the maximum length of @str to use, in bytes. If @len < 0, |
||
1691 | * then the string is nul-terminated. |
||
1692 | * |
||
1693 | * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text. |
||
1694 | * (Use g_utf8_validate() on all text before trying to use UTF-8 |
||
1695 | * utility functions with it.) |
||
1696 | * |
||
1697 | * This function is intended for programmatic uses of reversed strings. |
||
1698 | * It pays no attention to decomposed characters, combining marks, byte |
||
1699 | * order marks, directional indicators (LRM, LRO, etc) and similar |
||
1700 | * characters which might need special handling when reversing a string |
||
1701 | * for display purposes. |
||
1702 | * |
||
1703 | * Note that unlike g_strreverse(), this function returns |
||
1704 | * newly-allocated memory, which should be freed with g_free() when |
||
1705 | * no longer needed. |
||
1706 | * |
||
1707 | * Returns: a newly-allocated string which is the reverse of @str |
||
1708 | * |
||
1709 | * Since: 2.2 |
||
1710 | */ |
||
1711 | gchar * |
||
1712 | g_utf8_strreverse (const gchar *str, |
||
1713 | gssize len) |
||
1714 | { |
||
1715 | gchar *r, *result; |
||
1716 | const gchar *p; |
||
1717 | |||
1718 | if (len < 0) |
||
1719 | len = strlen (str); |
||
1720 | |||
1721 | result = g_new (gchar, len + 1); |
||
1722 | r = result + len; |
||
1723 | p = str; |
||
1724 | while (r > result) |
||
1725 | { |
||
1726 | gchar *m, skip = g_utf8_skip[*(guchar*) p]; |
||
1727 | r -= skip; |
||
1728 | for (m = r; skip; skip--) |
||
1729 | *m++ = *p++; |
||
1730 | } |
||
1731 | result[len] = 0; |
||
1732 | |||
1733 | return result; |
||
1734 | } |
||
1735 | |||
1736 | |||
1737 | gchar * |
||
1738 | _g_utf8_make_valid (const gchar *name) |
||
1739 | { |
||
1740 | GString *string; |
||
1741 | const gchar *remainder, *invalid; |
||
1742 | gint remaining_bytes, valid_bytes; |
||
1743 | |||
1744 | g_return_val_if_fail (name != NULL, NULL); |
||
1745 | |||
1746 | string = NULL; |
||
1747 | remainder = name; |
||
1748 | remaining_bytes = strlen (name); |
||
1749 | |||
1750 | while (remaining_bytes != 0) |
||
1751 | { |
||
1752 | if (g_utf8_validate (remainder, remaining_bytes, &invalid)) |
||
1753 | break; |
||
1754 | valid_bytes = invalid - remainder; |
||
1755 | |||
1756 | if (string == NULL) |
||
1757 | string = g_string_sized_new (remaining_bytes); |
||
1758 | |||
1759 | g_string_append_len (string, remainder, valid_bytes); |
||
1760 | /* append U+FFFD REPLACEMENT CHARACTER */ |
||
1761 | g_string_append (string, "\357\277\275"); |
||
1762 | |||
1763 | remaining_bytes -= valid_bytes + 1; |
||
1764 | remainder = invalid + 1; |
||
1765 | } |
||
1766 | |||
1767 | if (string == NULL) |
||
1768 | return g_strdup (name); |
||
1769 | |||
1770 | g_string_append (string, remainder); |
||
1771 | |||
1772 | g_assert (g_utf8_validate (string->str, -1, NULL)); |
||
1773 | |||
1774 | return g_string_free (string, FALSE); |
||
1775 | } |