nexmon – Blame information for rev 1
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | /* guniprop.c - Unicode character properties. |
2 | * |
||
3 | * Copyright (C) 1999 Tom Tromey |
||
4 | * Copyright (C) 2000 Red Hat, Inc. |
||
5 | * |
||
6 | * This library is free software; you can redistribute it and/or |
||
7 | * modify it under the terms of the GNU Lesser General Public |
||
8 | * License as published by the Free Software Foundation; either |
||
9 | * version 2 of the License, or (at your option) any later version. |
||
10 | * |
||
11 | * This library is distributed in the hope that it will be useful, |
||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
14 | * Lesser General Public License for more details. |
||
15 | * |
||
16 | * You should have received a copy of the GNU Lesser General Public |
||
17 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
||
18 | */ |
||
19 | |||
20 | #include "config.h" |
||
21 | |||
22 | #include <stdlib.h> |
||
23 | #include <stddef.h> |
||
24 | #include <string.h> |
||
25 | #include <locale.h> |
||
26 | |||
27 | #include "gmem.h" |
||
28 | #include "gstring.h" |
||
29 | #include "gtestutils.h" |
||
30 | #include "gtypes.h" |
||
31 | #include "gunicode.h" |
||
32 | #include "gunichartables.h" |
||
33 | #include "gmirroringtable.h" |
||
34 | #include "gscripttable.h" |
||
35 | #include "gunicodeprivate.h" |
||
36 | #ifdef G_OS_WIN32 |
||
37 | #include "gwin32.h" |
||
38 | #endif |
||
39 | |||
/* Index of the attribute data row for a 256-character page.  Pages up to
 * G_UNICODE_LAST_PAGE_PART1 are looked up in attr_table_part1; later pages
 * are looked up in attr_table_part2 after subtracting 0xe00. */
#define ATTR_TABLE(page) \
  (((page) <= G_UNICODE_LAST_PAGE_PART1) \
   ? attr_table_part1[page] \
   : attr_table_part2[(page) - 0xe00])

/* Attribute value stored for offset `ch` within `page`.  Evaluates to 0
 * when the page's index equals G_UNICODE_MAX_TABLE_INDEX. */
#define ATTTABLE(page, ch) \
  ((ATTR_TABLE (page) == G_UNICODE_MAX_TABLE_INDEX) \
   ? 0 \
   : (attr_data[ATTR_TABLE (page)][ch]))

/* Type lookup for the first table part.  An entry that is at least
 * G_UNICODE_MAX_TABLE_INDEX encodes the type directly (entry minus
 * G_UNICODE_MAX_TABLE_INDEX); a smaller entry selects a row of type_data
 * that is then indexed by `ch`. */
#define TTYPE_PART1(page, ch) \
  ((type_table_part1[page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part1[page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part1[page]][ch]))

/* Same lookup as TTYPE_PART1, using type_table_part2. */
#define TTYPE_PART2(page, ch) \
  ((type_table_part2[page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part2[page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part2[page]][ch]))

/* Type of character `uc`.  Characters up to G_UNICODE_LAST_CHAR_PART1 use
 * the first table part, characters in 0xe0000..G_UNICODE_LAST_CHAR use the
 * second part, and everything else is G_UNICODE_UNASSIGNED. */
#define TYPE(uc) \
  (((uc) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TTYPE_PART1 ((uc) >> 8, (uc) & 0xff) \
   : (((uc) >= 0xe0000 && (uc) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((uc) - 0xe0000) >> 8, (uc) & 0xff) \
      : G_UNICODE_UNASSIGNED))
||
63 | |||
64 | |||
/* IS: non-zero when the bit for `type` is set in the bit mask `mask`.
 * OR: the bit mask `rest` with the bit for `type` added. */
#define IS(type, mask)  (((guint)1 << (type)) & (mask))
#define OR(type, rest)  (((guint)1 << (type)) | (rest))

/* Non-zero when `type` is one of the five letter types. */
#define ISALPHA(type) \
  IS ((type), \
      OR (G_UNICODE_LOWERCASE_LETTER, \
      OR (G_UNICODE_UPPERCASE_LETTER, \
      OR (G_UNICODE_TITLECASE_LETTER, \
      OR (G_UNICODE_MODIFIER_LETTER, \
      OR (G_UNICODE_OTHER_LETTER, 0))))))

/* Non-zero when `type` is a number type or one of the five letter types. */
#define ISALDIGIT(type) \
  IS ((type), \
      OR (G_UNICODE_DECIMAL_NUMBER, \
      OR (G_UNICODE_LETTER_NUMBER, \
      OR (G_UNICODE_OTHER_NUMBER, \
      OR (G_UNICODE_LOWERCASE_LETTER, \
      OR (G_UNICODE_UPPERCASE_LETTER, \
      OR (G_UNICODE_TITLECASE_LETTER, \
      OR (G_UNICODE_MODIFIER_LETTER, \
      OR (G_UNICODE_OTHER_LETTER, 0)))))))))

/* Non-zero when `type` is a non-spacing, spacing or enclosing mark. */
#define ISMARK(type) \
  IS ((type), \
      OR (G_UNICODE_NON_SPACING_MARK, \
      OR (G_UNICODE_SPACING_MARK, \
      OR (G_UNICODE_ENCLOSING_MARK, 0))))

/* Non-zero when `type` is a non-spacing mark, an enclosing mark or a
 * format character. */
#define ISZEROWIDTHTYPE(type) \
  IS ((type), \
      OR (G_UNICODE_NON_SPACING_MARK, \
      OR (G_UNICODE_ENCLOSING_MARK, \
      OR (G_UNICODE_FORMAT, 0))))
||
96 | |||
97 | /** |
||
98 | * g_unichar_isalnum: |
||
99 | * @c: a Unicode character |
||
100 | * |
||
101 | * Determines whether a character is alphanumeric. |
||
102 | * Given some UTF-8 text, obtain a character value |
||
103 | * with g_utf8_get_char(). |
||
104 | * |
||
105 | * Returns: %TRUE if @c is an alphanumeric character |
||
106 | **/ |
||
107 | gboolean |
||
108 | g_unichar_isalnum (gunichar c) |
||
109 | { |
||
110 | return ISALDIGIT (TYPE (c)) ? TRUE : FALSE; |
||
111 | } |
||
112 | |||
113 | /** |
||
114 | * g_unichar_isalpha: |
||
115 | * @c: a Unicode character |
||
116 | * |
||
117 | * Determines whether a character is alphabetic (i.e. a letter). |
||
118 | * Given some UTF-8 text, obtain a character value with |
||
119 | * g_utf8_get_char(). |
||
120 | * |
||
121 | * Returns: %TRUE if @c is an alphabetic character |
||
122 | **/ |
||
123 | gboolean |
||
124 | g_unichar_isalpha (gunichar c) |
||
125 | { |
||
126 | return ISALPHA (TYPE (c)) ? TRUE : FALSE; |
||
127 | } |
||
128 | |||
129 | |||
130 | /** |
||
131 | * g_unichar_iscntrl: |
||
132 | * @c: a Unicode character |
||
133 | * |
||
134 | * Determines whether a character is a control character. |
||
135 | * Given some UTF-8 text, obtain a character value with |
||
136 | * g_utf8_get_char(). |
||
137 | * |
||
138 | * Returns: %TRUE if @c is a control character |
||
139 | **/ |
||
140 | gboolean |
||
141 | g_unichar_iscntrl (gunichar c) |
||
142 | { |
||
143 | return TYPE (c) == G_UNICODE_CONTROL; |
||
144 | } |
||
145 | |||
146 | /** |
||
147 | * g_unichar_isdigit: |
||
148 | * @c: a Unicode character |
||
149 | * |
||
150 | * Determines whether a character is numeric (i.e. a digit). This |
||
151 | * covers ASCII 0-9 and also digits in other languages/scripts. Given |
||
152 | * some UTF-8 text, obtain a character value with g_utf8_get_char(). |
||
153 | * |
||
154 | * Returns: %TRUE if @c is a digit |
||
155 | **/ |
||
156 | gboolean |
||
157 | g_unichar_isdigit (gunichar c) |
||
158 | { |
||
159 | return TYPE (c) == G_UNICODE_DECIMAL_NUMBER; |
||
160 | } |
||
161 | |||
162 | |||
163 | /** |
||
164 | * g_unichar_isgraph: |
||
165 | * @c: a Unicode character |
||
166 | * |
||
167 | * Determines whether a character is printable and not a space |
||
168 | * (returns %FALSE for control characters, format characters, and |
||
169 | * spaces). g_unichar_isprint() is similar, but returns %TRUE for |
||
170 | * spaces. Given some UTF-8 text, obtain a character value with |
||
171 | * g_utf8_get_char(). |
||
172 | * |
||
173 | * Returns: %TRUE if @c is printable unless it's a space |
||
174 | **/ |
||
175 | gboolean |
||
176 | g_unichar_isgraph (gunichar c) |
||
177 | { |
||
178 | return !IS (TYPE(c), |
||
179 | OR (G_UNICODE_CONTROL, |
||
180 | OR (G_UNICODE_FORMAT, |
||
181 | OR (G_UNICODE_UNASSIGNED, |
||
182 | OR (G_UNICODE_SURROGATE, |
||
183 | OR (G_UNICODE_SPACE_SEPARATOR, |
||
184 | 0)))))); |
||
185 | } |
||
186 | |||
187 | /** |
||
188 | * g_unichar_islower: |
||
189 | * @c: a Unicode character |
||
190 | * |
||
191 | * Determines whether a character is a lowercase letter. |
||
192 | * Given some UTF-8 text, obtain a character value with |
||
193 | * g_utf8_get_char(). |
||
194 | * |
||
195 | * Returns: %TRUE if @c is a lowercase letter |
||
196 | **/ |
||
197 | gboolean |
||
198 | g_unichar_islower (gunichar c) |
||
199 | { |
||
200 | return TYPE (c) == G_UNICODE_LOWERCASE_LETTER; |
||
201 | } |
||
202 | |||
203 | |||
204 | /** |
||
205 | * g_unichar_isprint: |
||
206 | * @c: a Unicode character |
||
207 | * |
||
208 | * Determines whether a character is printable. |
||
209 | * Unlike g_unichar_isgraph(), returns %TRUE for spaces. |
||
210 | * Given some UTF-8 text, obtain a character value with |
||
211 | * g_utf8_get_char(). |
||
212 | * |
||
213 | * Returns: %TRUE if @c is printable |
||
214 | **/ |
||
215 | gboolean |
||
216 | g_unichar_isprint (gunichar c) |
||
217 | { |
||
218 | return !IS (TYPE(c), |
||
219 | OR (G_UNICODE_CONTROL, |
||
220 | OR (G_UNICODE_FORMAT, |
||
221 | OR (G_UNICODE_UNASSIGNED, |
||
222 | OR (G_UNICODE_SURROGATE, |
||
223 | 0))))); |
||
224 | } |
||
225 | |||
226 | /** |
||
227 | * g_unichar_ispunct: |
||
228 | * @c: a Unicode character |
||
229 | * |
||
230 | * Determines whether a character is punctuation or a symbol. |
||
231 | * Given some UTF-8 text, obtain a character value with |
||
232 | * g_utf8_get_char(). |
||
233 | * |
||
234 | * Returns: %TRUE if @c is a punctuation or symbol character |
||
235 | **/ |
||
236 | gboolean |
||
237 | g_unichar_ispunct (gunichar c) |
||
238 | { |
||
239 | return IS (TYPE(c), |
||
240 | OR (G_UNICODE_CONNECT_PUNCTUATION, |
||
241 | OR (G_UNICODE_DASH_PUNCTUATION, |
||
242 | OR (G_UNICODE_CLOSE_PUNCTUATION, |
||
243 | OR (G_UNICODE_FINAL_PUNCTUATION, |
||
244 | OR (G_UNICODE_INITIAL_PUNCTUATION, |
||
245 | OR (G_UNICODE_OTHER_PUNCTUATION, |
||
246 | OR (G_UNICODE_OPEN_PUNCTUATION, |
||
247 | OR (G_UNICODE_CURRENCY_SYMBOL, |
||
248 | OR (G_UNICODE_MODIFIER_SYMBOL, |
||
249 | OR (G_UNICODE_MATH_SYMBOL, |
||
250 | OR (G_UNICODE_OTHER_SYMBOL, |
||
251 | 0)))))))))))) ? TRUE : FALSE; |
||
252 | } |
||
253 | |||
254 | /** |
||
255 | * g_unichar_isspace: |
||
256 | * @c: a Unicode character |
||
257 | * |
||
258 | * Determines whether a character is a space, tab, or line separator |
||
259 | * (newline, carriage return, etc.). Given some UTF-8 text, obtain a |
||
260 | * character value with g_utf8_get_char(). |
||
261 | * |
||
262 | * (Note: don't use this to do word breaking; you have to use |
||
263 | * Pango or equivalent to get word breaking right, the algorithm |
||
264 | * is fairly complex.) |
||
265 | * |
||
266 | * Returns: %TRUE if @c is a space character |
||
267 | **/ |
||
268 | gboolean |
||
269 | g_unichar_isspace (gunichar c) |
||
270 | { |
||
271 | switch (c) |
||
272 | { |
||
273 | /* special-case these since Unicode thinks they are not spaces */ |
||
274 | case '\t': |
||
275 | case '\n': |
||
276 | case '\r': |
||
277 | case '\f': |
||
278 | return TRUE; |
||
279 | break; |
||
280 | |||
281 | default: |
||
282 | { |
||
283 | return IS (TYPE(c), |
||
284 | OR (G_UNICODE_SPACE_SEPARATOR, |
||
285 | OR (G_UNICODE_LINE_SEPARATOR, |
||
286 | OR (G_UNICODE_PARAGRAPH_SEPARATOR, |
||
287 | 0)))) ? TRUE : FALSE; |
||
288 | } |
||
289 | break; |
||
290 | } |
||
291 | } |
||
292 | |||
293 | /** |
||
294 | * g_unichar_ismark: |
||
295 | * @c: a Unicode character |
||
296 | * |
||
297 | * Determines whether a character is a mark (non-spacing mark, |
||
298 | * combining mark, or enclosing mark in Unicode speak). |
||
299 | * Given some UTF-8 text, obtain a character value |
||
300 | * with g_utf8_get_char(). |
||
301 | * |
||
302 | * Note: in most cases where isalpha characters are allowed, |
||
303 | * ismark characters should be allowed to as they are essential |
||
304 | * for writing most European languages as well as many non-Latin |
||
305 | * scripts. |
||
306 | * |
||
307 | * Returns: %TRUE if @c is a mark character |
||
308 | * |
||
309 | * Since: 2.14 |
||
310 | **/ |
||
311 | gboolean |
||
312 | g_unichar_ismark (gunichar c) |
||
313 | { |
||
314 | return ISMARK (TYPE (c)); |
||
315 | } |
||
316 | |||
317 | /** |
||
318 | * g_unichar_isupper: |
||
319 | * @c: a Unicode character |
||
320 | * |
||
321 | * Determines if a character is uppercase. |
||
322 | * |
||
323 | * Returns: %TRUE if @c is an uppercase character |
||
324 | **/ |
||
325 | gboolean |
||
326 | g_unichar_isupper (gunichar c) |
||
327 | { |
||
328 | return TYPE (c) == G_UNICODE_UPPERCASE_LETTER; |
||
329 | } |
||
330 | |||
331 | /** |
||
332 | * g_unichar_istitle: |
||
333 | * @c: a Unicode character |
||
334 | * |
||
335 | * Determines if a character is titlecase. Some characters in |
||
336 | * Unicode which are composites, such as the DZ digraph |
||
337 | * have three case variants instead of just two. The titlecase |
||
338 | * form is used at the beginning of a word where only the |
||
339 | * first letter is capitalized. The titlecase form of the DZ |
||
340 | * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z. |
||
341 | * |
||
342 | * Returns: %TRUE if the character is titlecase |
||
343 | **/ |
||
344 | gboolean |
||
345 | g_unichar_istitle (gunichar c) |
||
346 | { |
||
347 | unsigned int i; |
||
348 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
||
349 | if (title_table[i][0] == c) |
||
350 | return TRUE; |
||
351 | return FALSE; |
||
352 | } |
||
353 | |||
354 | /** |
||
355 | * g_unichar_isxdigit: |
||
356 | * @c: a Unicode character. |
||
357 | * |
||
358 | * Determines if a character is a hexidecimal digit. |
||
359 | * |
||
360 | * Returns: %TRUE if the character is a hexadecimal digit |
||
361 | **/ |
||
362 | gboolean |
||
363 | g_unichar_isxdigit (gunichar c) |
||
364 | { |
||
365 | return ((c >= 'a' && c <= 'f') |
||
366 | || (c >= 'A' && c <= 'F') |
||
367 | || (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)); |
||
368 | } |
||
369 | |||
370 | /** |
||
371 | * g_unichar_isdefined: |
||
372 | * @c: a Unicode character |
||
373 | * |
||
374 | * Determines if a given character is assigned in the Unicode |
||
375 | * standard. |
||
376 | * |
||
377 | * Returns: %TRUE if the character has an assigned value |
||
378 | **/ |
||
379 | gboolean |
||
380 | g_unichar_isdefined (gunichar c) |
||
381 | { |
||
382 | return !IS (TYPE(c), |
||
383 | OR (G_UNICODE_UNASSIGNED, |
||
384 | OR (G_UNICODE_SURROGATE, |
||
385 | 0))); |
||
386 | } |
||
387 | |||
388 | /** |
||
389 | * g_unichar_iszerowidth: |
||
390 | * @c: a Unicode character |
||
391 | * |
||
392 | * Determines if a given character typically takes zero width when rendered. |
||
393 | * The return value is %TRUE for all non-spacing and enclosing marks |
||
394 | * (e.g., combining accents), format characters, zero-width |
||
395 | * space, but not U+00AD SOFT HYPHEN. |
||
396 | * |
||
397 | * A typical use of this function is with one of g_unichar_iswide() or |
||
398 | * g_unichar_iswide_cjk() to determine the number of cells a string occupies |
||
399 | * when displayed on a grid display (terminals). However, note that not all |
||
400 | * terminals support zero-width rendering of zero-width marks. |
||
401 | * |
||
402 | * Returns: %TRUE if the character has zero width |
||
403 | * |
||
404 | * Since: 2.14 |
||
405 | **/ |
||
406 | gboolean |
||
407 | g_unichar_iszerowidth (gunichar c) |
||
408 | { |
||
409 | if (G_UNLIKELY (c == 0x00AD)) |
||
410 | return FALSE; |
||
411 | |||
412 | if (G_UNLIKELY (ISZEROWIDTHTYPE (TYPE (c)))) |
||
413 | return TRUE; |
||
414 | |||
415 | if (G_UNLIKELY ((c >= 0x1160 && c < 0x1200) || |
||
416 | c == 0x200B)) |
||
417 | return TRUE; |
||
418 | |||
419 | return FALSE; |
||
420 | } |
||
421 | |||
422 | static int |
||
423 | interval_compare (const void *key, const void *elt) |
||
424 | { |
||
425 | gunichar c = GPOINTER_TO_UINT (key); |
||
426 | struct Interval *interval = (struct Interval *)elt; |
||
427 | |||
428 | if (c < interval->start) |
||
429 | return -1; |
||
430 | if (c > interval->end) |
||
431 | return +1; |
||
432 | |||
433 | return 0; |
||
434 | } |
||
435 | |||
436 | #define G_WIDTH_TABLE_MIDPOINT (G_N_ELEMENTS (g_unicode_width_table_wide) / 2) |
||
437 | |||
438 | static inline gboolean |
||
439 | g_unichar_iswide_bsearch (gunichar ch) |
||
440 | { |
||
441 | int lower = 0; |
||
442 | int upper = G_N_ELEMENTS (g_unicode_width_table_wide) + 1; |
||
443 | static int saved_mid = G_WIDTH_TABLE_MIDPOINT; |
||
444 | int mid = saved_mid; |
||
445 | |||
446 | do |
||
447 | { |
||
448 | if (ch < g_unicode_width_table_wide[mid].start) |
||
449 | upper = mid - 1; |
||
450 | else if (ch > g_unicode_width_table_wide[mid].end) |
||
451 | lower = mid + 1; |
||
452 | else |
||
453 | return TRUE; |
||
454 | |||
455 | mid = (lower + upper) / 2; |
||
456 | } |
||
457 | while (lower <= upper); |
||
458 | |||
459 | return FALSE; |
||
460 | } |
||
461 | |||
462 | /** |
||
463 | * g_unichar_iswide: |
||
464 | * @c: a Unicode character |
||
465 | * |
||
466 | * Determines if a character is typically rendered in a double-width |
||
467 | * cell. |
||
468 | * |
||
469 | * Returns: %TRUE if the character is wide |
||
470 | **/ |
||
471 | gboolean |
||
472 | g_unichar_iswide (gunichar c) |
||
473 | { |
||
474 | if (c < g_unicode_width_table_wide[0].start) |
||
475 | return FALSE; |
||
476 | else |
||
477 | return g_unichar_iswide_bsearch (c); |
||
478 | } |
||
479 | |||
480 | |||
481 | /** |
||
482 | * g_unichar_iswide_cjk: |
||
483 | * @c: a Unicode character |
||
484 | * |
||
485 | * Determines if a character is typically rendered in a double-width |
||
486 | * cell under legacy East Asian locales. If a character is wide according to |
||
487 | * g_unichar_iswide(), then it is also reported wide with this function, but |
||
488 | * the converse is not necessarily true. See the |
||
489 | * [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) |
||
490 | * for details. |
||
491 | * |
||
492 | * If a character passes the g_unichar_iswide() test then it will also pass |
||
493 | * this test, but not the other way around. Note that some characters may |
||
494 | * pass both this test and g_unichar_iszerowidth(). |
||
495 | * |
||
496 | * Returns: %TRUE if the character is wide in legacy East Asian locales |
||
497 | * |
||
498 | * Since: 2.12 |
||
499 | */ |
||
500 | gboolean |
||
501 | g_unichar_iswide_cjk (gunichar c) |
||
502 | { |
||
503 | if (g_unichar_iswide (c)) |
||
504 | return TRUE; |
||
505 | |||
506 | if (bsearch (GUINT_TO_POINTER (c), |
||
507 | g_unicode_width_table_ambiguous, |
||
508 | G_N_ELEMENTS (g_unicode_width_table_ambiguous), |
||
509 | sizeof g_unicode_width_table_ambiguous[0], |
||
510 | interval_compare)) |
||
511 | return TRUE; |
||
512 | |||
513 | return FALSE; |
||
514 | } |
||
515 | |||
516 | |||
517 | /** |
||
518 | * g_unichar_toupper: |
||
519 | * @c: a Unicode character |
||
520 | * |
||
521 | * Converts a character to uppercase. |
||
522 | * |
||
523 | * Returns: the result of converting @c to uppercase. |
||
524 | * If @c is not an lowercase or titlecase character, |
||
525 | * or has no upper case equivalent @c is returned unchanged. |
||
526 | **/ |
||
527 | gunichar |
||
528 | g_unichar_toupper (gunichar c) |
||
529 | { |
||
530 | int t = TYPE (c); |
||
531 | if (t == G_UNICODE_LOWERCASE_LETTER) |
||
532 | { |
||
533 | gunichar val = ATTTABLE (c >> 8, c & 0xff); |
||
534 | if (val >= 0x1000000) |
||
535 | { |
||
536 | const gchar *p = special_case_table + val - 0x1000000; |
||
537 | val = g_utf8_get_char (p); |
||
538 | } |
||
539 | /* Some lowercase letters, e.g., U+000AA, FEMININE ORDINAL INDICATOR, |
||
540 | * do not have an uppercase equivalent, in which case val will be |
||
541 | * zero. |
||
542 | */ |
||
543 | return val ? val : c; |
||
544 | } |
||
545 | else if (t == G_UNICODE_TITLECASE_LETTER) |
||
546 | { |
||
547 | unsigned int i; |
||
548 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
||
549 | { |
||
550 | if (title_table[i][0] == c) |
||
551 | return title_table[i][1] ? title_table[i][1] : c; |
||
552 | } |
||
553 | } |
||
554 | return c; |
||
555 | } |
||
556 | |||
557 | /** |
||
558 | * g_unichar_tolower: |
||
559 | * @c: a Unicode character. |
||
560 | * |
||
561 | * Converts a character to lower case. |
||
562 | * |
||
563 | * Returns: the result of converting @c to lower case. |
||
564 | * If @c is not an upperlower or titlecase character, |
||
565 | * or has no lowercase equivalent @c is returned unchanged. |
||
566 | **/ |
||
567 | gunichar |
||
568 | g_unichar_tolower (gunichar c) |
||
569 | { |
||
570 | int t = TYPE (c); |
||
571 | if (t == G_UNICODE_UPPERCASE_LETTER) |
||
572 | { |
||
573 | gunichar val = ATTTABLE (c >> 8, c & 0xff); |
||
574 | if (val >= 0x1000000) |
||
575 | { |
||
576 | const gchar *p = special_case_table + val - 0x1000000; |
||
577 | return g_utf8_get_char (p); |
||
578 | } |
||
579 | else |
||
580 | { |
||
581 | /* Not all uppercase letters are guaranteed to have a lowercase |
||
582 | * equivalent. If this is the case, val will be zero. */ |
||
583 | return val ? val : c; |
||
584 | } |
||
585 | } |
||
586 | else if (t == G_UNICODE_TITLECASE_LETTER) |
||
587 | { |
||
588 | unsigned int i; |
||
589 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
||
590 | { |
||
591 | if (title_table[i][0] == c) |
||
592 | return title_table[i][2]; |
||
593 | } |
||
594 | } |
||
595 | return c; |
||
596 | } |
||
597 | |||
598 | /** |
||
599 | * g_unichar_totitle: |
||
600 | * @c: a Unicode character |
||
601 | * |
||
602 | * Converts a character to the titlecase. |
||
603 | * |
||
604 | * Returns: the result of converting @c to titlecase. |
||
605 | * If @c is not an uppercase or lowercase character, |
||
606 | * @c is returned unchanged. |
||
607 | **/ |
||
608 | gunichar |
||
609 | g_unichar_totitle (gunichar c) |
||
610 | { |
||
611 | unsigned int i; |
||
612 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
||
613 | { |
||
614 | if (title_table[i][0] == c || title_table[i][1] == c |
||
615 | || title_table[i][2] == c) |
||
616 | return title_table[i][0]; |
||
617 | } |
||
618 | |||
619 | if (TYPE (c) == G_UNICODE_LOWERCASE_LETTER) |
||
620 | return g_unichar_toupper (c); |
||
621 | |||
622 | return c; |
||
623 | } |
||
624 | |||
625 | /** |
||
626 | * g_unichar_digit_value: |
||
627 | * @c: a Unicode character |
||
628 | * |
||
629 | * Determines the numeric value of a character as a decimal |
||
630 | * digit. |
||
631 | * |
||
632 | * Returns: If @c is a decimal digit (according to |
||
633 | * g_unichar_isdigit()), its numeric value. Otherwise, -1. |
||
634 | **/ |
||
635 | int |
||
636 | g_unichar_digit_value (gunichar c) |
||
637 | { |
||
638 | if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
||
639 | return ATTTABLE (c >> 8, c & 0xff); |
||
640 | return -1; |
||
641 | } |
||
642 | |||
643 | /** |
||
644 | * g_unichar_xdigit_value: |
||
645 | * @c: a Unicode character |
||
646 | * |
||
647 | * Determines the numeric value of a character as a hexidecimal |
||
648 | * digit. |
||
649 | * |
||
650 | * Returns: If @c is a hex digit (according to |
||
651 | * g_unichar_isxdigit()), its numeric value. Otherwise, -1. |
||
652 | **/ |
||
653 | int |
||
654 | g_unichar_xdigit_value (gunichar c) |
||
655 | { |
||
656 | if (c >= 'A' && c <= 'F') |
||
657 | return c - 'A' + 10; |
||
658 | if (c >= 'a' && c <= 'f') |
||
659 | return c - 'a' + 10; |
||
660 | if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
||
661 | return ATTTABLE (c >> 8, c & 0xff); |
||
662 | return -1; |
||
663 | } |
||
664 | |||
665 | /** |
||
666 | * g_unichar_type: |
||
667 | * @c: a Unicode character |
||
668 | * |
||
669 | * Classifies a Unicode character by type. |
||
670 | * |
||
671 | * Returns: the type of the character. |
||
672 | **/ |
||
673 | GUnicodeType |
||
674 | g_unichar_type (gunichar c) |
||
675 | { |
||
676 | return TYPE (c); |
||
677 | } |
||
678 | |||
679 | /* |
||
680 | * Case mapping functions |
||
681 | */ |
||
682 | |||
/* Locale classes that need their own case-mapping rules. */
typedef enum
{
  LOCALE_NORMAL = 0,      /* no locale-specific handling */
  LOCALE_TURKIC = 1,      /* locale name starts with "az" or "tr" */
  LOCALE_LITHUANIAN = 2   /* locale name starts with "lt" */
} LocaleType;
||
688 | |||
689 | static LocaleType |
||
690 | get_locale_type (void) |
||
691 | { |
||
692 | #ifdef G_OS_WIN32 |
||
693 | char *tem = g_win32_getlocale (); |
||
694 | char locale[2]; |
||
695 | |||
696 | locale[0] = tem[0]; |
||
697 | locale[1] = tem[1]; |
||
698 | g_free (tem); |
||
699 | #else |
||
700 | const char *locale = setlocale (LC_CTYPE, NULL); |
||
701 | |||
702 | if (locale == NULL) |
||
703 | return LOCALE_NORMAL; |
||
704 | #endif |
||
705 | |||
706 | switch (locale[0]) |
||
707 | { |
||
708 | case 'a': |
||
709 | if (locale[1] == 'z') |
||
710 | return LOCALE_TURKIC; |
||
711 | break; |
||
712 | case 'l': |
||
713 | if (locale[1] == 't') |
||
714 | return LOCALE_LITHUANIAN; |
||
715 | break; |
||
716 | case 't': |
||
717 | if (locale[1] == 'r') |
||
718 | return LOCALE_TURKIC; |
||
719 | break; |
||
720 | } |
||
721 | |||
722 | return LOCALE_NORMAL; |
||
723 | } |
||
724 | |||
725 | static gint |
||
726 | output_marks (const char **p_inout, |
||
727 | char *out_buffer, |
||
728 | gboolean remove_dot) |
||
729 | { |
||
730 | const char *p = *p_inout; |
||
731 | gint len = 0; |
||
732 | |||
733 | while (*p) |
||
734 | { |
||
735 | gunichar c = g_utf8_get_char (p); |
||
736 | |||
737 | if (ISMARK (TYPE (c))) |
||
738 | { |
||
739 | if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */) |
||
740 | len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL); |
||
741 | p = g_utf8_next_char (p); |
||
742 | } |
||
743 | else |
||
744 | break; |
||
745 | } |
||
746 | |||
747 | *p_inout = p; |
||
748 | return len; |
||
749 | } |
||
750 | |||
751 | static gint |
||
752 | output_special_case (gchar *out_buffer, |
||
753 | int offset, |
||
754 | int type, |
||
755 | int which) |
||
756 | { |
||
757 | const gchar *p = special_case_table + offset; |
||
758 | gint len; |
||
759 | |||
760 | if (type != G_UNICODE_TITLECASE_LETTER) |
||
761 | p = g_utf8_next_char (p); |
||
762 | |||
763 | if (which == 1) |
||
764 | p += strlen (p) + 1; |
||
765 | |||
766 | len = strlen (p); |
||
767 | if (out_buffer) |
||
768 | memcpy (out_buffer, p, len); |
||
769 | |||
770 | return len; |
||
771 | } |
||
772 | |||
773 | static gsize |
||
774 | real_toupper (const gchar *str, |
||
775 | gssize max_len, |
||
776 | gchar *out_buffer, |
||
777 | LocaleType locale_type) |
||
778 | { |
||
779 | const gchar *p = str; |
||
780 | const char *last = NULL; |
||
781 | gsize len = 0; |
||
782 | gboolean last_was_i = FALSE; |
||
783 | |||
784 | while ((max_len < 0 || p < str + max_len) && *p) |
||
785 | { |
||
786 | gunichar c = g_utf8_get_char (p); |
||
787 | int t = TYPE (c); |
||
788 | gunichar val; |
||
789 | |||
790 | last = p; |
||
791 | p = g_utf8_next_char (p); |
||
792 | |||
793 | if (locale_type == LOCALE_LITHUANIAN) |
||
794 | { |
||
795 | if (c == 'i') |
||
796 | last_was_i = TRUE; |
||
797 | else |
||
798 | { |
||
799 | if (last_was_i) |
||
800 | { |
||
801 | /* Nasty, need to remove any dot above. Though |
||
802 | * I think only E WITH DOT ABOVE occurs in practice |
||
803 | * which could simplify this considerably. |
||
804 | */ |
||
805 | gsize decomp_len, i; |
||
806 | gunichar decomp[G_UNICHAR_MAX_DECOMPOSITION_LENGTH]; |
||
807 | |||
808 | decomp_len = g_unichar_fully_decompose (c, FALSE, decomp, G_N_ELEMENTS (decomp)); |
||
809 | for (i=0; i < decomp_len; i++) |
||
810 | { |
||
811 | if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */) |
||
812 | len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL); |
||
813 | } |
||
814 | |||
815 | len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE); |
||
816 | |||
817 | continue; |
||
818 | } |
||
819 | |||
820 | if (!ISMARK (t)) |
||
821 | last_was_i = FALSE; |
||
822 | } |
||
823 | } |
||
824 | |||
825 | if (locale_type == LOCALE_TURKIC && c == 'i') |
||
826 | { |
||
827 | /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
||
828 | len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL); |
||
829 | } |
||
830 | else if (c == 0x0345) /* COMBINING GREEK YPOGEGRAMMENI */ |
||
831 | { |
||
832 | /* Nasty, need to move it after other combining marks .. this would go away if |
||
833 | * we normalized first. |
||
834 | */ |
||
835 | len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE); |
||
836 | |||
837 | /* And output as GREEK CAPITAL LETTER IOTA */ |
||
838 | len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL); |
||
839 | } |
||
840 | else if (IS (t, |
||
841 | OR (G_UNICODE_LOWERCASE_LETTER, |
||
842 | OR (G_UNICODE_TITLECASE_LETTER, |
||
843 | 0)))) |
||
844 | { |
||
845 | val = ATTTABLE (c >> 8, c & 0xff); |
||
846 | |||
847 | if (val >= 0x1000000) |
||
848 | { |
||
849 | len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, |
||
850 | t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1); |
||
851 | } |
||
852 | else |
||
853 | { |
||
854 | if (t == G_UNICODE_TITLECASE_LETTER) |
||
855 | { |
||
856 | unsigned int i; |
||
857 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
||
858 | { |
||
859 | if (title_table[i][0] == c) |
||
860 | { |
||
861 | val = title_table[i][1]; |
||
862 | break; |
||
863 | } |
||
864 | } |
||
865 | } |
||
866 | |||
867 | /* Some lowercase letters, e.g., U+000AA, FEMININE ORDINAL INDICATOR, |
||
868 | * do not have an uppercase equivalent, in which case val will be |
||
869 | * zero. */ |
||
870 | len += g_unichar_to_utf8 (val ? val : c, out_buffer ? out_buffer + len : NULL); |
||
871 | } |
||
872 | } |
||
873 | else |
||
874 | { |
||
875 | gsize char_len = g_utf8_skip[*(guchar *)last]; |
||
876 | |||
877 | if (out_buffer) |
||
878 | memcpy (out_buffer + len, last, char_len); |
||
879 | |||
880 | len += char_len; |
||
881 | } |
||
882 | |||
883 | } |
||
884 | |||
885 | return len; |
||
886 | } |
||
887 | |||
888 | /** |
||
889 | * g_utf8_strup: |
||
890 | * @str: a UTF-8 encoded string |
||
891 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
||
892 | * |
||
893 | * Converts all Unicode characters in the string that have a case |
||
894 | * to uppercase. The exact manner that this is done depends |
||
895 | * on the current locale, and may result in the number of |
||
896 | * characters in the string increasing. (For instance, the |
||
897 | * German ess-zet will be changed to SS.) |
||
898 | * |
||
899 | * Returns: a newly allocated string, with all characters |
||
900 | * converted to uppercase. |
||
901 | **/ |
||
902 | gchar * |
||
903 | g_utf8_strup (const gchar *str, |
||
904 | gssize len) |
||
905 | { |
||
906 | gsize result_len; |
||
907 | LocaleType locale_type; |
||
908 | gchar *result; |
||
909 | |||
910 | g_return_val_if_fail (str != NULL, NULL); |
||
911 | |||
912 | locale_type = get_locale_type (); |
||
913 | |||
914 | /* |
||
915 | * We use a two pass approach to keep memory management simple |
||
916 | */ |
||
917 | result_len = real_toupper (str, len, NULL, locale_type); |
||
918 | result = g_malloc (result_len + 1); |
||
919 | real_toupper (str, len, result, locale_type); |
||
920 | result[result_len] = '\0'; |
||
921 | |||
922 | return result; |
||
923 | } |
||
924 | |||
925 | /* traverses the string checking for characters with combining class == 230 |
||
926 | * until a base character is found */ |
||
927 | static gboolean |
||
928 | has_more_above (const gchar *str) |
||
929 | { |
||
930 | const gchar *p = str; |
||
931 | gint combining_class; |
||
932 | |||
933 | while (*p) |
||
934 | { |
||
935 | combining_class = g_unichar_combining_class (g_utf8_get_char (p)); |
||
936 | if (combining_class == 230) |
||
937 | return TRUE; |
||
938 | else if (combining_class == 0) |
||
939 | break; |
||
940 | |||
941 | p = g_utf8_next_char (p); |
||
942 | } |
||
943 | |||
944 | return FALSE; |
||
945 | } |
||
946 | |||
947 | static gsize |
||
948 | real_tolower (const gchar *str, |
||
949 | gssize max_len, |
||
950 | gchar *out_buffer, |
||
951 | LocaleType locale_type) |
||
952 | { |
||
953 | const gchar *p = str; |
||
954 | const char *last = NULL; |
||
955 | gsize len = 0; |
||
956 | |||
957 | while ((max_len < 0 || p < str + max_len) && *p) |
||
958 | { |
||
959 | gunichar c = g_utf8_get_char (p); |
||
960 | int t = TYPE (c); |
||
961 | gunichar val; |
||
962 | |||
963 | last = p; |
||
964 | p = g_utf8_next_char (p); |
||
965 | |||
966 | if (locale_type == LOCALE_TURKIC && c == 'I') |
||
967 | { |
||
968 | if (g_utf8_get_char (p) == 0x0307) |
||
969 | { |
||
970 | /* I + COMBINING DOT ABOVE => i (U+0069) */ |
||
971 | len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); |
||
972 | p = g_utf8_next_char (p); |
||
973 | } |
||
974 | else |
||
975 | { |
||
976 | /* I => LATIN SMALL LETTER DOTLESS I */ |
||
977 | len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL); |
||
978 | } |
||
979 | } |
||
980 | /* Introduce an explicit dot above when lowercasing capital I's and J's |
||
981 | * whenever there are more accents above. [SpecialCasing.txt] */ |
||
982 | else if (locale_type == LOCALE_LITHUANIAN && |
||
983 | (c == 0x00cc || c == 0x00cd || c == 0x0128)) |
||
984 | { |
||
985 | len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); |
||
986 | len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); |
||
987 | |||
988 | switch (c) |
||
989 | { |
||
990 | case 0x00cc: |
||
991 | len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL); |
||
992 | break; |
||
993 | case 0x00cd: |
||
994 | len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL); |
||
995 | break; |
||
996 | case 0x0128: |
||
997 | len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL); |
||
998 | break; |
||
999 | } |
||
1000 | } |
||
1001 | else if (locale_type == LOCALE_LITHUANIAN && |
||
1002 | (c == 'I' || c == 'J' || c == 0x012e) && |
||
1003 | has_more_above (p)) |
||
1004 | { |
||
1005 | len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL); |
||
1006 | len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); |
||
1007 | } |
||
1008 | else if (c == 0x03A3) /* GREEK CAPITAL LETTER SIGMA */ |
||
1009 | { |
||
1010 | if ((max_len < 0 || p < str + max_len) && *p) |
||
1011 | { |
||
1012 | gunichar next_c = g_utf8_get_char (p); |
||
1013 | int next_type = TYPE(next_c); |
||
1014 | |||
1015 | /* SIGMA mapps differently depending on whether it is |
||
1016 | * final or not. The following simplified test would |
||
1017 | * fail in the case of combining marks following the |
||
1018 | * sigma, but I don't think that occurs in real text. |
||
1019 | * The test here matches that in ICU. |
||
1020 | */ |
||
1021 | if (ISALPHA (next_type)) /* Lu,Ll,Lt,Lm,Lo */ |
||
1022 | val = 0x3c3; /* GREEK SMALL SIGMA */ |
||
1023 | else |
||
1024 | val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
||
1025 | } |
||
1026 | else |
||
1027 | val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
||
1028 | |||
1029 | len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL); |
||
1030 | } |
||
1031 | else if (IS (t, |
||
1032 | OR (G_UNICODE_UPPERCASE_LETTER, |
||
1033 | OR (G_UNICODE_TITLECASE_LETTER, |
||
1034 | 0)))) |
||
1035 | { |
||
1036 | val = ATTTABLE (c >> 8, c & 0xff); |
||
1037 | |||
1038 | if (val >= 0x1000000) |
||
1039 | { |
||
1040 | len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 0); |
||
1041 | } |
||
1042 | else |
||
1043 | { |
||
1044 | if (t == G_UNICODE_TITLECASE_LETTER) |
||
1045 | { |
||
1046 | unsigned int i; |
||
1047 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
||
1048 | { |
||
1049 | if (title_table[i][0] == c) |
||
1050 | { |
||
1051 | val = title_table[i][2]; |
||
1052 | break; |
||
1053 | } |
||
1054 | } |
||
1055 | } |
||
1056 | |||
1057 | /* Not all uppercase letters are guaranteed to have a lowercase |
||
1058 | * equivalent. If this is the case, val will be zero. */ |
||
1059 | len += g_unichar_to_utf8 (val ? val : c, out_buffer ? out_buffer + len : NULL); |
||
1060 | } |
||
1061 | } |
||
1062 | else |
||
1063 | { |
||
1064 | gsize char_len = g_utf8_skip[*(guchar *)last]; |
||
1065 | |||
1066 | if (out_buffer) |
||
1067 | memcpy (out_buffer + len, last, char_len); |
||
1068 | |||
1069 | len += char_len; |
||
1070 | } |
||
1071 | |||
1072 | } |
||
1073 | |||
1074 | return len; |
||
1075 | } |
||
1076 | |||
1077 | /** |
||
1078 | * g_utf8_strdown: |
||
1079 | * @str: a UTF-8 encoded string |
||
1080 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
||
1081 | * |
||
1082 | * Converts all Unicode characters in the string that have a case |
||
1083 | * to lowercase. The exact manner that this is done depends |
||
1084 | * on the current locale, and may result in the number of |
||
1085 | * characters in the string changing. |
||
1086 | * |
||
1087 | * Returns: a newly allocated string, with all characters |
||
1088 | * converted to lowercase. |
||
1089 | **/ |
||
1090 | gchar * |
||
1091 | g_utf8_strdown (const gchar *str, |
||
1092 | gssize len) |
||
1093 | { |
||
1094 | gsize result_len; |
||
1095 | LocaleType locale_type; |
||
1096 | gchar *result; |
||
1097 | |||
1098 | g_return_val_if_fail (str != NULL, NULL); |
||
1099 | |||
1100 | locale_type = get_locale_type (); |
||
1101 | |||
1102 | /* |
||
1103 | * We use a two pass approach to keep memory management simple |
||
1104 | */ |
||
1105 | result_len = real_tolower (str, len, NULL, locale_type); |
||
1106 | result = g_malloc (result_len + 1); |
||
1107 | real_tolower (str, len, result, locale_type); |
||
1108 | result[result_len] = '\0'; |
||
1109 | |||
1110 | return result; |
||
1111 | } |
||
1112 | |||
1113 | /** |
||
1114 | * g_utf8_casefold: |
||
1115 | * @str: a UTF-8 encoded string |
||
1116 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
||
1117 | * |
||
1118 | * Converts a string into a form that is independent of case. The |
||
1119 | * result will not correspond to any particular case, but can be |
||
1120 | * compared for equality or ordered with the results of calling |
||
1121 | * g_utf8_casefold() on other strings. |
||
1122 | * |
||
1123 | * Note that calling g_utf8_casefold() followed by g_utf8_collate() is |
||
1124 | * only an approximation to the correct linguistic case insensitive |
||
1125 | * ordering, though it is a fairly good one. Getting this exactly |
||
1126 | * right would require a more sophisticated collation function that |
||
1127 | * takes case sensitivity into account. GLib does not currently |
||
1128 | * provide such a function. |
||
1129 | * |
||
1130 | * Returns: a newly allocated string, that is a |
||
1131 | * case independent form of @str. |
||
1132 | **/ |
||
1133 | gchar * |
||
1134 | g_utf8_casefold (const gchar *str, |
||
1135 | gssize len) |
||
1136 | { |
||
1137 | GString *result; |
||
1138 | const char *p; |
||
1139 | |||
1140 | g_return_val_if_fail (str != NULL, NULL); |
||
1141 | |||
1142 | result = g_string_new (NULL); |
||
1143 | p = str; |
||
1144 | while ((len < 0 || p < str + len) && *p) |
||
1145 | { |
||
1146 | gunichar ch = g_utf8_get_char (p); |
||
1147 | |||
1148 | int start = 0; |
||
1149 | int end = G_N_ELEMENTS (casefold_table); |
||
1150 | |||
1151 | if (ch >= casefold_table[start].ch && |
||
1152 | ch <= casefold_table[end - 1].ch) |
||
1153 | { |
||
1154 | while (TRUE) |
||
1155 | { |
||
1156 | int half = (start + end) / 2; |
||
1157 | if (ch == casefold_table[half].ch) |
||
1158 | { |
||
1159 | g_string_append (result, casefold_table[half].data); |
||
1160 | goto next; |
||
1161 | } |
||
1162 | else if (half == start) |
||
1163 | break; |
||
1164 | else if (ch > casefold_table[half].ch) |
||
1165 | start = half; |
||
1166 | else |
||
1167 | end = half; |
||
1168 | } |
||
1169 | } |
||
1170 | |||
1171 | g_string_append_unichar (result, g_unichar_tolower (ch)); |
||
1172 | |||
1173 | next: |
||
1174 | p = g_utf8_next_char (p); |
||
1175 | } |
||
1176 | |||
1177 | return g_string_free (result, FALSE); |
||
1178 | } |
||
1179 | |||
1180 | /** |
||
1181 | * g_unichar_get_mirror_char: |
||
1182 | * @ch: a Unicode character |
||
1183 | * @mirrored_ch: location to store the mirrored character |
||
1184 | * |
||
1185 | * In Unicode, some characters are "mirrored". This means that their |
||
1186 | * images are mirrored horizontally in text that is laid out from right |
||
1187 | * to left. For instance, "(" would become its mirror image, ")", in |
||
1188 | * right-to-left text. |
||
1189 | * |
||
1190 | * If @ch has the Unicode mirrored property and there is another unicode |
||
1191 | * character that typically has a glyph that is the mirror image of @ch's |
||
1192 | * glyph and @mirrored_ch is set, it puts that character in the address |
||
1193 | * pointed to by @mirrored_ch. Otherwise the original character is put. |
||
1194 | * |
||
1195 | * Returns: %TRUE if @ch has a mirrored character, %FALSE otherwise |
||
1196 | * |
||
1197 | * Since: 2.4 |
||
1198 | **/ |
||
1199 | gboolean |
||
1200 | g_unichar_get_mirror_char (gunichar ch, |
||
1201 | gunichar *mirrored_ch) |
||
1202 | { |
||
1203 | gboolean found; |
||
1204 | gunichar mirrored; |
||
1205 | |||
1206 | mirrored = GLIB_GET_MIRRORING(ch); |
||
1207 | |||
1208 | found = ch != mirrored; |
||
1209 | if (mirrored_ch) |
||
1210 | *mirrored_ch = mirrored; |
||
1211 | |||
1212 | return found; |
||
1213 | |||
1214 | } |
||
1215 | |||
1216 | #define G_SCRIPT_TABLE_MIDPOINT (G_N_ELEMENTS (g_script_table) / 2) |
||
1217 | |||
1218 | static inline GUnicodeScript |
||
1219 | g_unichar_get_script_bsearch (gunichar ch) |
||
1220 | { |
||
1221 | int lower = 0; |
||
1222 | int upper = G_N_ELEMENTS (g_script_table) - 1; |
||
1223 | static int saved_mid = G_SCRIPT_TABLE_MIDPOINT; |
||
1224 | int mid = saved_mid; |
||
1225 | |||
1226 | |||
1227 | do |
||
1228 | { |
||
1229 | if (ch < g_script_table[mid].start) |
||
1230 | upper = mid - 1; |
||
1231 | else if (ch >= g_script_table[mid].start + g_script_table[mid].chars) |
||
1232 | lower = mid + 1; |
||
1233 | else |
||
1234 | return g_script_table[saved_mid = mid].script; |
||
1235 | |||
1236 | mid = (lower + upper) / 2; |
||
1237 | } |
||
1238 | while (lower <= upper); |
||
1239 | |||
1240 | return G_UNICODE_SCRIPT_UNKNOWN; |
||
1241 | } |
||
1242 | |||
1243 | /** |
||
1244 | * g_unichar_get_script: |
||
1245 | * @ch: a Unicode character |
||
1246 | * |
||
1247 | * Looks up the #GUnicodeScript for a particular character (as defined |
||
1248 | * by Unicode Standard Annex \#24). No check is made for @ch being a |
||
1249 | * valid Unicode character; if you pass in invalid character, the |
||
1250 | * result is undefined. |
||
1251 | * |
||
1252 | * This function is equivalent to pango_script_for_unichar() and the |
||
1253 | * two are interchangeable. |
||
1254 | * |
||
1255 | * Returns: the #GUnicodeScript for the character. |
||
1256 | * |
||
1257 | * Since: 2.14 |
||
1258 | */ |
||
1259 | GUnicodeScript |
||
1260 | g_unichar_get_script (gunichar ch) |
||
1261 | { |
||
1262 | if (ch < G_EASY_SCRIPTS_RANGE) |
||
1263 | return g_script_easy_table[ch]; |
||
1264 | else |
||
1265 | return g_unichar_get_script_bsearch (ch); |
||
1266 | } |
||
1267 | |||
1268 | |||
1269 | /* http://unicode.org/iso15924/ */ |
||
1270 | static const guint32 iso15924_tags[] = |
||
1271 | { |
||
1272 | #define PACK(a,b,c,d) ((guint32)((((guint8)(a))<<24)|(((guint8)(b))<<16)|(((guint8)(c))<<8)|((guint8)(d)))) |
||
1273 | |||
1274 | PACK ('Z','y','y','y'), /* G_UNICODE_SCRIPT_COMMON */ |
||
1275 | PACK ('Z','i','n','h'), /* G_UNICODE_SCRIPT_INHERITED */ |
||
1276 | PACK ('A','r','a','b'), /* G_UNICODE_SCRIPT_ARABIC */ |
||
1277 | PACK ('A','r','m','n'), /* G_UNICODE_SCRIPT_ARMENIAN */ |
||
1278 | PACK ('B','e','n','g'), /* G_UNICODE_SCRIPT_BENGALI */ |
||
1279 | PACK ('B','o','p','o'), /* G_UNICODE_SCRIPT_BOPOMOFO */ |
||
1280 | PACK ('C','h','e','r'), /* G_UNICODE_SCRIPT_CHEROKEE */ |
||
1281 | PACK ('C','o','p','t'), /* G_UNICODE_SCRIPT_COPTIC */ |
||
1282 | PACK ('C','y','r','l'), /* G_UNICODE_SCRIPT_CYRILLIC */ |
||
1283 | PACK ('D','s','r','t'), /* G_UNICODE_SCRIPT_DESERET */ |
||
1284 | PACK ('D','e','v','a'), /* G_UNICODE_SCRIPT_DEVANAGARI */ |
||
1285 | PACK ('E','t','h','i'), /* G_UNICODE_SCRIPT_ETHIOPIC */ |
||
1286 | PACK ('G','e','o','r'), /* G_UNICODE_SCRIPT_GEORGIAN */ |
||
1287 | PACK ('G','o','t','h'), /* G_UNICODE_SCRIPT_GOTHIC */ |
||
1288 | PACK ('G','r','e','k'), /* G_UNICODE_SCRIPT_GREEK */ |
||
1289 | PACK ('G','u','j','r'), /* G_UNICODE_SCRIPT_GUJARATI */ |
||
1290 | PACK ('G','u','r','u'), /* G_UNICODE_SCRIPT_GURMUKHI */ |
||
1291 | PACK ('H','a','n','i'), /* G_UNICODE_SCRIPT_HAN */ |
||
1292 | PACK ('H','a','n','g'), /* G_UNICODE_SCRIPT_HANGUL */ |
||
1293 | PACK ('H','e','b','r'), /* G_UNICODE_SCRIPT_HEBREW */ |
||
1294 | PACK ('H','i','r','a'), /* G_UNICODE_SCRIPT_HIRAGANA */ |
||
1295 | PACK ('K','n','d','a'), /* G_UNICODE_SCRIPT_KANNADA */ |
||
1296 | PACK ('K','a','n','a'), /* G_UNICODE_SCRIPT_KATAKANA */ |
||
1297 | PACK ('K','h','m','r'), /* G_UNICODE_SCRIPT_KHMER */ |
||
1298 | PACK ('L','a','o','o'), /* G_UNICODE_SCRIPT_LAO */ |
||
1299 | PACK ('L','a','t','n'), /* G_UNICODE_SCRIPT_LATIN */ |
||
1300 | PACK ('M','l','y','m'), /* G_UNICODE_SCRIPT_MALAYALAM */ |
||
1301 | PACK ('M','o','n','g'), /* G_UNICODE_SCRIPT_MONGOLIAN */ |
||
1302 | PACK ('M','y','m','r'), /* G_UNICODE_SCRIPT_MYANMAR */ |
||
1303 | PACK ('O','g','a','m'), /* G_UNICODE_SCRIPT_OGHAM */ |
||
1304 | PACK ('I','t','a','l'), /* G_UNICODE_SCRIPT_OLD_ITALIC */ |
||
1305 | PACK ('O','r','y','a'), /* G_UNICODE_SCRIPT_ORIYA */ |
||
1306 | PACK ('R','u','n','r'), /* G_UNICODE_SCRIPT_RUNIC */ |
||
1307 | PACK ('S','i','n','h'), /* G_UNICODE_SCRIPT_SINHALA */ |
||
1308 | PACK ('S','y','r','c'), /* G_UNICODE_SCRIPT_SYRIAC */ |
||
1309 | PACK ('T','a','m','l'), /* G_UNICODE_SCRIPT_TAMIL */ |
||
1310 | PACK ('T','e','l','u'), /* G_UNICODE_SCRIPT_TELUGU */ |
||
1311 | PACK ('T','h','a','a'), /* G_UNICODE_SCRIPT_THAANA */ |
||
1312 | PACK ('T','h','a','i'), /* G_UNICODE_SCRIPT_THAI */ |
||
1313 | PACK ('T','i','b','t'), /* G_UNICODE_SCRIPT_TIBETAN */ |
||
1314 | PACK ('C','a','n','s'), /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */ |
||
1315 | PACK ('Y','i','i','i'), /* G_UNICODE_SCRIPT_YI */ |
||
1316 | PACK ('T','g','l','g'), /* G_UNICODE_SCRIPT_TAGALOG */ |
||
1317 | PACK ('H','a','n','o'), /* G_UNICODE_SCRIPT_HANUNOO */ |
||
1318 | PACK ('B','u','h','d'), /* G_UNICODE_SCRIPT_BUHID */ |
||
1319 | PACK ('T','a','g','b'), /* G_UNICODE_SCRIPT_TAGBANWA */ |
||
1320 | |||
1321 | /* Unicode-4.0 additions */ |
||
1322 | PACK ('B','r','a','i'), /* G_UNICODE_SCRIPT_BRAILLE */ |
||
1323 | PACK ('C','p','r','t'), /* G_UNICODE_SCRIPT_CYPRIOT */ |
||
1324 | PACK ('L','i','m','b'), /* G_UNICODE_SCRIPT_LIMBU */ |
||
1325 | PACK ('O','s','m','a'), /* G_UNICODE_SCRIPT_OSMANYA */ |
||
1326 | PACK ('S','h','a','w'), /* G_UNICODE_SCRIPT_SHAVIAN */ |
||
1327 | PACK ('L','i','n','b'), /* G_UNICODE_SCRIPT_LINEAR_B */ |
||
1328 | PACK ('T','a','l','e'), /* G_UNICODE_SCRIPT_TAI_LE */ |
||
1329 | PACK ('U','g','a','r'), /* G_UNICODE_SCRIPT_UGARITIC */ |
||
1330 | |||
1331 | /* Unicode-4.1 additions */ |
||
1332 | PACK ('T','a','l','u'), /* G_UNICODE_SCRIPT_NEW_TAI_LUE */ |
||
1333 | PACK ('B','u','g','i'), /* G_UNICODE_SCRIPT_BUGINESE */ |
||
1334 | PACK ('G','l','a','g'), /* G_UNICODE_SCRIPT_GLAGOLITIC */ |
||
1335 | PACK ('T','f','n','g'), /* G_UNICODE_SCRIPT_TIFINAGH */ |
||
1336 | PACK ('S','y','l','o'), /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */ |
||
1337 | PACK ('X','p','e','o'), /* G_UNICODE_SCRIPT_OLD_PERSIAN */ |
||
1338 | PACK ('K','h','a','r'), /* G_UNICODE_SCRIPT_KHAROSHTHI */ |
||
1339 | |||
1340 | /* Unicode-5.0 additions */ |
||
1341 | PACK ('Z','z','z','z'), /* G_UNICODE_SCRIPT_UNKNOWN */ |
||
1342 | PACK ('B','a','l','i'), /* G_UNICODE_SCRIPT_BALINESE */ |
||
1343 | PACK ('X','s','u','x'), /* G_UNICODE_SCRIPT_CUNEIFORM */ |
||
1344 | PACK ('P','h','n','x'), /* G_UNICODE_SCRIPT_PHOENICIAN */ |
||
1345 | PACK ('P','h','a','g'), /* G_UNICODE_SCRIPT_PHAGS_PA */ |
||
1346 | PACK ('N','k','o','o'), /* G_UNICODE_SCRIPT_NKO */ |
||
1347 | |||
1348 | /* Unicode-5.1 additions */ |
||
1349 | PACK ('K','a','l','i'), /* G_UNICODE_SCRIPT_KAYAH_LI */ |
||
1350 | PACK ('L','e','p','c'), /* G_UNICODE_SCRIPT_LEPCHA */ |
||
1351 | PACK ('R','j','n','g'), /* G_UNICODE_SCRIPT_REJANG */ |
||
1352 | PACK ('S','u','n','d'), /* G_UNICODE_SCRIPT_SUNDANESE */ |
||
1353 | PACK ('S','a','u','r'), /* G_UNICODE_SCRIPT_SAURASHTRA */ |
||
1354 | PACK ('C','h','a','m'), /* G_UNICODE_SCRIPT_CHAM */ |
||
1355 | PACK ('O','l','c','k'), /* G_UNICODE_SCRIPT_OL_CHIKI */ |
||
1356 | PACK ('V','a','i','i'), /* G_UNICODE_SCRIPT_VAI */ |
||
1357 | PACK ('C','a','r','i'), /* G_UNICODE_SCRIPT_CARIAN */ |
||
1358 | PACK ('L','y','c','i'), /* G_UNICODE_SCRIPT_LYCIAN */ |
||
1359 | PACK ('L','y','d','i'), /* G_UNICODE_SCRIPT_LYDIAN */ |
||
1360 | |||
1361 | /* Unicode-5.2 additions */ |
||
1362 | PACK ('A','v','s','t'), /* G_UNICODE_SCRIPT_AVESTAN */ |
||
1363 | PACK ('B','a','m','u'), /* G_UNICODE_SCRIPT_BAMUM */ |
||
1364 | PACK ('E','g','y','p'), /* G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS */ |
||
1365 | PACK ('A','r','m','i'), /* G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC */ |
||
1366 | PACK ('P','h','l','i'), /* G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI */ |
||
1367 | PACK ('P','r','t','i'), /* G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN */ |
||
1368 | PACK ('J','a','v','a'), /* G_UNICODE_SCRIPT_JAVANESE */ |
||
1369 | PACK ('K','t','h','i'), /* G_UNICODE_SCRIPT_KAITHI */ |
||
1370 | PACK ('L','i','s','u'), /* G_UNICODE_SCRIPT_LISU */ |
||
1371 | PACK ('M','t','e','i'), /* G_UNICODE_SCRIPT_MEETEI_MAYEK */ |
||
1372 | PACK ('S','a','r','b'), /* G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN */ |
||
1373 | PACK ('O','r','k','h'), /* G_UNICODE_SCRIPT_OLD_TURKIC */ |
||
1374 | PACK ('S','a','m','r'), /* G_UNICODE_SCRIPT_SAMARITAN */ |
||
1375 | PACK ('L','a','n','a'), /* G_UNICODE_SCRIPT_TAI_THAM */ |
||
1376 | PACK ('T','a','v','t'), /* G_UNICODE_SCRIPT_TAI_VIET */ |
||
1377 | |||
1378 | /* Unicode-6.0 additions */ |
||
1379 | PACK ('B','a','t','k'), /* G_UNICODE_SCRIPT_BATAK */ |
||
1380 | PACK ('B','r','a','h'), /* G_UNICODE_SCRIPT_BRAHMI */ |
||
1381 | PACK ('M','a','n','d'), /* G_UNICODE_SCRIPT_MANDAIC */ |
||
1382 | |||
1383 | /* Unicode-6.1 additions */ |
||
1384 | PACK ('C','a','k','m'), /* G_UNICODE_SCRIPT_CHAKMA */ |
||
1385 | PACK ('M','e','r','c'), /* G_UNICODE_SCRIPT_MEROITIC_CURSIVE */ |
||
1386 | PACK ('M','e','r','o'), /* G_UNICODE_SCRIPT_MEROITIC_HIEROGLYPHS */ |
||
1387 | PACK ('P','l','r','d'), /* G_UNICODE_SCRIPT_MIAO */ |
||
1388 | PACK ('S','h','r','d'), /* G_UNICODE_SCRIPT_SHARADA */ |
||
1389 | PACK ('S','o','r','a'), /* G_UNICODE_SCRIPT_SORA_SOMPENG */ |
||
1390 | PACK ('T','a','k','r'), /* G_UNICODE_SCRIPT_TAKRI */ |
||
1391 | |||
1392 | /* Unicode 7.0 additions */ |
||
1393 | PACK ('B','a','s','s'), /* G_UNICODE_SCRIPT_BASSA_VAH */ |
||
1394 | PACK ('A','g','h','b'), /* G_UNICODE_SCRIPT_CAUCASIAN_ALBANIAN */ |
||
1395 | PACK ('D','u','p','l'), /* G_UNICODE_SCRIPT_DUPLOYAN */ |
||
1396 | PACK ('E','l','b','a'), /* G_UNICODE_SCRIPT_ELBASAN */ |
||
1397 | PACK ('G','r','a','n'), /* G_UNICODE_SCRIPT_GRANTHA */ |
||
1398 | PACK ('K','h','o','j'), /* G_UNICODE_SCRIPT_KHOJKI*/ |
||
1399 | PACK ('S','i','n','d'), /* G_UNICODE_SCRIPT_KHUDAWADI */ |
||
1400 | PACK ('L','i','n','a'), /* G_UNICODE_SCRIPT_LINEAR_A */ |
||
1401 | PACK ('M','a','h','j'), /* G_UNICODE_SCRIPT_MAHAJANI */ |
||
1402 | PACK ('M','a','n','u'), /* G_UNICODE_SCRIPT_MANICHAEAN */ |
||
1403 | PACK ('M','e','n','d'), /* G_UNICODE_SCRIPT_MENDE_KIKAKUI */ |
||
1404 | PACK ('M','o','d','i'), /* G_UNICODE_SCRIPT_MODI */ |
||
1405 | PACK ('M','r','o','o'), /* G_UNICODE_SCRIPT_MRO */ |
||
1406 | PACK ('N','b','a','t'), /* G_UNICODE_SCRIPT_NABATAEAN */ |
||
1407 | PACK ('N','a','r','b'), /* G_UNICODE_SCRIPT_OLD_NORTH_ARABIAN */ |
||
1408 | PACK ('P','e','r','m'), /* G_UNICODE_SCRIPT_OLD_PERMIC */ |
||
1409 | PACK ('H','m','n','g'), /* G_UNICODE_SCRIPT_PAHAWH_HMONG */ |
||
1410 | PACK ('P','a','l','m'), /* G_UNICODE_SCRIPT_PALMYRENE */ |
||
1411 | PACK ('P','a','u','c'), /* G_UNICODE_SCRIPT_PAU_CIN_HAU */ |
||
1412 | PACK ('P','h','l','p'), /* G_UNICODE_SCRIPT_PSALTER_PAHLAVI */ |
||
1413 | PACK ('S','i','d','d'), /* G_UNICODE_SCRIPT_SIDDHAM */ |
||
1414 | PACK ('T','i','r','h'), /* G_UNICODE_SCRIPT_TIRHUTA */ |
||
1415 | PACK ('W','a','r','a'), /* G_UNICODE_SCRIPT_WARANG_CITI */ |
||
1416 | |||
1417 | /* Unicode 8.0 additions */ |
||
1418 | PACK ('A','h','o','m'), /* G_UNICODE_SCRIPT_AHOM */ |
||
1419 | PACK ('H','l','u','w'), /* G_UNICODE_SCRIPT_ANATOLIAN_HIEROGLYPHS */ |
||
1420 | PACK ('H','a','t','r'), /* G_UNICODE_SCRIPT_HATRAN */ |
||
1421 | PACK ('M','u','l','t'), /* G_UNICODE_SCRIPT_MULTANI */ |
||
1422 | PACK ('H','u','n','g'), /* G_UNICODE_SCRIPT_OLD_HUNGARIAN */ |
||
1423 | PACK ('S','g','n','w'), /* G_UNICODE_SCRIPT_SIGNWRITING */ |
||
1424 | #undef PACK |
||
1425 | }; |
||
1426 | |||
1427 | /** |
||
1428 | * g_unicode_script_to_iso15924: |
||
1429 | * @script: a Unicode script |
||
1430 | * |
||
1431 | * Looks up the ISO 15924 code for @script. ISO 15924 assigns four-letter |
||
1432 | * codes to scripts. For example, the code for Arabic is 'Arab'. The |
||
1433 | * four letter codes are encoded as a @guint32 by this function in a |
||
1434 | * big-endian fashion. That is, the code returned for Arabic is |
||
1435 | * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc). |
||
1436 | * |
||
1437 | * See |
||
1438 | * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html) |
||
1439 | * for details. |
||
1440 | * |
||
1441 | * Returns: the ISO 15924 code for @script, encoded as an integer, |
||
1442 | * of zero if @script is %G_UNICODE_SCRIPT_INVALID_CODE or |
||
1443 | * ISO 15924 code 'Zzzz' (script code for UNKNOWN) if @script is not understood. |
||
1444 | * |
||
1445 | * Since: 2.30 |
||
1446 | */ |
||
1447 | guint32 |
||
1448 | g_unicode_script_to_iso15924 (GUnicodeScript script) |
||
1449 | { |
||
1450 | if (G_UNLIKELY (script == G_UNICODE_SCRIPT_INVALID_CODE)) |
||
1451 | return 0; |
||
1452 | |||
1453 | if (G_UNLIKELY (script < 0 || script >= (int) G_N_ELEMENTS (iso15924_tags))) |
||
1454 | return 0x5A7A7A7A; |
||
1455 | |||
1456 | return iso15924_tags[script]; |
||
1457 | } |
||
1458 | |||
1459 | /** |
||
1460 | * g_unicode_script_from_iso15924: |
||
1461 | * @iso15924: a Unicode script |
||
1462 | * |
||
1463 | * Looks up the Unicode script for @iso15924. ISO 15924 assigns four-letter |
||
1464 | * codes to scripts. For example, the code for Arabic is 'Arab'. |
||
1465 | * This function accepts four letter codes encoded as a @guint32 in a |
||
1466 | * big-endian fashion. That is, the code expected for Arabic is |
||
1467 | * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc). |
||
1468 | * |
||
1469 | * See |
||
1470 | * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html) |
||
1471 | * for details. |
||
1472 | * |
||
1473 | * Returns: the Unicode script for @iso15924, or |
||
1474 | * of %G_UNICODE_SCRIPT_INVALID_CODE if @iso15924 is zero and |
||
1475 | * %G_UNICODE_SCRIPT_UNKNOWN if @iso15924 is unknown. |
||
1476 | * |
||
1477 | * Since: 2.30 |
||
1478 | */ |
||
1479 | GUnicodeScript |
||
1480 | g_unicode_script_from_iso15924 (guint32 iso15924) |
||
1481 | { |
||
1482 | unsigned int i; |
||
1483 | |||
1484 | if (!iso15924) |
||
1485 | return G_UNICODE_SCRIPT_INVALID_CODE; |
||
1486 | |||
1487 | for (i = 0; i < G_N_ELEMENTS (iso15924_tags); i++) |
||
1488 | if (iso15924_tags[i] == iso15924) |
||
1489 | return (GUnicodeScript) i; |
||
1490 | |||
1491 | return G_UNICODE_SCRIPT_UNKNOWN; |
||
1492 | } |