nexmon – Blame information for rev 1
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | /* decomp.c - Character decomposition. |
2 | * |
||
3 | * Copyright (C) 1999, 2000 Tom Tromey |
||
4 | * Copyright 2000 Red Hat, Inc. |
||
5 | * |
||
6 | * The Gnome Library is free software; you can redistribute it and/or |
||
7 | * modify it under the terms of the GNU Lesser General Public License as |
||
8 | * published by the Free Software Foundation; either version 2 of the |
||
9 | * License, or (at your option) any later version. |
||
10 | * |
||
11 | * The Gnome Library is distributed in the hope that it will be useful, |
||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
14 | * Lesser General Public License for more details. |
||
15 | * |
||
16 | * You should have received a copy of the GNU Lesser General Public |
||
17 | * License along with the Gnome Library; see the file COPYING.LIB. If not, |
||
18 | * see <http://www.gnu.org/licenses/>. |
||
19 | */ |
||
20 | |||
21 | /** |
||
22 | * SECTION:unicode |
||
23 | * @Title: Unicode Manipulation |
||
24 | * @Short_description: functions operating on Unicode characters and |
||
25 | * UTF-8 strings |
||
26 | * @See_also: g_locale_to_utf8(), g_locale_from_utf8() |
||
27 | * |
||
28 | * This section describes a number of functions for dealing with |
||
29 | * Unicode characters and strings. There are analogues of the |
||
30 | * traditional `ctype.h` character classification and case conversion |
||
31 | * functions, UTF-8 analogues of some string utility functions, |
||
32 | * functions to perform normalization, case conversion and collation |
||
33 | * on UTF-8 strings and finally functions to convert between the UTF-8, |
||
34 | * UTF-16 and UCS-4 encodings of Unicode. |
||
35 | * |
||
36 | * The implementations of the Unicode functions in GLib are based |
||
37 | * on the Unicode Character Data tables, which are available from |
||
38 | * [www.unicode.org](http://www.unicode.org/). |
||
39 | * GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1, |
||
40 | * GLib 2.12 supports Unicode 5.0, GLib 2.16.3 supports Unicode 5.1, |
||
41 | * GLib 2.30 supports Unicode 6.0. |
||
42 | */ |
||
43 | |||
44 | #include "config.h" |
||
45 | |||
46 | #include <stdlib.h> |
||
47 | |||
48 | #include "gunicode.h" |
||
49 | #include "gunidecomp.h" |
||
50 | #include "gmem.h" |
||
51 | #include "gunicomp.h" |
||
52 | #include "gunicodeprivate.h" |
||
53 | |||
54 | |||
/* Shared page lookup for the combining-class tables.
 *
 * A page entry >= G_UNICODE_MAX_TABLE_INDEX means the whole page has a
 * single combining class, stored as (entry - G_UNICODE_MAX_TABLE_INDEX).
 * Any other entry selects a row of cclass_data, indexed by the low byte
 * of the character.
 */
#define CC_LOOKUP(Table, Page, Char) \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[(Table)[Page]][Char]))

/* Lookup in the first table (characters up to G_UNICODE_LAST_CHAR_PART1). */
#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

/* Lookup in the second table (characters from 0xe0000 upwards). */
#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Canonical combining class of Char; 0 for anything outside both tables.
 * Note: Char is evaluated more than once.
 */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
||
71 | |||
72 | /** |
||
73 | * g_unichar_combining_class: |
||
74 | * @uc: a Unicode character |
||
75 | * |
||
76 | * Determines the canonical combining class of a Unicode character. |
||
77 | * |
||
78 | * Returns: the combining class of the character |
||
79 | * |
||
80 | * Since: 2.14 |
||
81 | **/ |
||
82 | gint |
||
83 | g_unichar_combining_class (gunichar uc) |
||
84 | { |
||
85 | return COMBINING_CLASS (uc); |
||
86 | } |
||
87 | |||
/* Constants for algorithmic Hangul syllable [de]composition.
 * The names follow the ones used in UAX #15.
 */
enum
{
  SBase = 0xAC00,             /* first precomposed syllable */
  LBase = 0x1100,             /* first leading consonant jamo */
  VBase = 0x1161,             /* first vowel jamo */
  TBase = 0x11A7,             /* one before the first trailing consonant jamo */
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = (VCount * TCount), /* syllables per leading consonant */
  SCount = (LCount * NCount)  /* total number of precomposed syllables */
};
||
98 | |||
99 | /** |
||
100 | * g_unicode_canonical_ordering: |
||
101 | * @string: a UCS-4 encoded string. |
||
102 | * @len: the maximum length of @string to use. |
||
103 | * |
||
104 | * Computes the canonical ordering of a string in-place. |
||
105 | * This rearranges decomposed characters in the string |
||
106 | * according to their combining classes. See the Unicode |
||
107 | * manual for more information. |
||
108 | **/ |
||
109 | void |
||
110 | g_unicode_canonical_ordering (gunichar *string, |
||
111 | gsize len) |
||
112 | { |
||
113 | gsize i; |
||
114 | int swap = 1; |
||
115 | |||
116 | while (swap) |
||
117 | { |
||
118 | int last; |
||
119 | swap = 0; |
||
120 | last = COMBINING_CLASS (string[0]); |
||
121 | for (i = 0; i < len - 1; ++i) |
||
122 | { |
||
123 | int next = COMBINING_CLASS (string[i + 1]); |
||
124 | if (next != 0 && last > next) |
||
125 | { |
||
126 | gsize j; |
||
127 | /* Percolate item leftward through string. */ |
||
128 | for (j = i + 1; j > 0; --j) |
||
129 | { |
||
130 | gunichar t; |
||
131 | if (COMBINING_CLASS (string[j - 1]) <= next) |
||
132 | break; |
||
133 | t = string[j]; |
||
134 | string[j] = string[j - 1]; |
||
135 | string[j - 1] = t; |
||
136 | swap = 1; |
||
137 | } |
||
138 | /* We're re-entering the loop looking at the old |
||
139 | character again. */ |
||
140 | next = last; |
||
141 | } |
||
142 | last = next; |
||
143 | } |
||
144 | } |
||
145 | } |
||
146 | |||
147 | /* http://www.unicode.org/unicode/reports/tr15/#Hangul |
||
148 | * r should be null or have sufficient space. Calling with r == NULL will |
||
149 | * only calculate the result_len; however, a buffer with space for three |
||
150 | * characters will always be big enough. */ |
||
151 | static void |
||
152 | decompose_hangul (gunichar s, |
||
153 | gunichar *r, |
||
154 | gsize *result_len) |
||
155 | { |
||
156 | gint SIndex = s - SBase; |
||
157 | gint TIndex = SIndex % TCount; |
||
158 | |||
159 | if (r) |
||
160 | { |
||
161 | r[0] = LBase + SIndex / NCount; |
||
162 | r[1] = VBase + (SIndex % NCount) / TCount; |
||
163 | } |
||
164 | |||
165 | if (TIndex) |
||
166 | { |
||
167 | if (r) |
||
168 | r[2] = TBase + TIndex; |
||
169 | *result_len = 3; |
||
170 | } |
||
171 | else |
||
172 | *result_len = 2; |
||
173 | } |
||
174 | |||
175 | /* returns a pointer to a null-terminated UTF-8 string */ |
||
176 | static const gchar * |
||
177 | find_decomposition (gunichar ch, |
||
178 | gboolean compat) |
||
179 | { |
||
180 | int start = 0; |
||
181 | int end = G_N_ELEMENTS (decomp_table); |
||
182 | |||
183 | if (ch >= decomp_table[start].ch && |
||
184 | ch <= decomp_table[end - 1].ch) |
||
185 | { |
||
186 | while (TRUE) |
||
187 | { |
||
188 | int half = (start + end) / 2; |
||
189 | if (ch == decomp_table[half].ch) |
||
190 | { |
||
191 | int offset; |
||
192 | |||
193 | if (compat) |
||
194 | { |
||
195 | offset = decomp_table[half].compat_offset; |
||
196 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
||
197 | offset = decomp_table[half].canon_offset; |
||
198 | } |
||
199 | else |
||
200 | { |
||
201 | offset = decomp_table[half].canon_offset; |
||
202 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
||
203 | return NULL; |
||
204 | } |
||
205 | |||
206 | return &(decomp_expansion_string[offset]); |
||
207 | } |
||
208 | else if (half == start) |
||
209 | break; |
||
210 | else if (ch > decomp_table[half].ch) |
||
211 | start = half; |
||
212 | else |
||
213 | end = half; |
||
214 | } |
||
215 | } |
||
216 | |||
217 | return NULL; |
||
218 | } |
||
219 | |||
220 | /** |
||
221 | * g_unicode_canonical_decomposition: |
||
222 | * @ch: a Unicode character. |
||
223 | * @result_len: location to store the length of the return value. |
||
224 | * |
||
225 | * Computes the canonical decomposition of a Unicode character. |
||
226 | * |
||
227 | * Returns: a newly allocated string of Unicode characters. |
||
228 | * @result_len is set to the resulting length of the string. |
||
229 | * |
||
230 | * Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose() |
||
231 | * instead. |
||
232 | **/ |
||
233 | gunichar * |
||
234 | g_unicode_canonical_decomposition (gunichar ch, |
||
235 | gsize *result_len) |
||
236 | { |
||
237 | const gchar *decomp; |
||
238 | const gchar *p; |
||
239 | gunichar *r; |
||
240 | |||
241 | /* Hangul syllable */ |
||
242 | if (ch >= SBase && ch < SBase + SCount) |
||
243 | { |
||
244 | decompose_hangul (ch, NULL, result_len); |
||
245 | r = g_malloc (*result_len * sizeof (gunichar)); |
||
246 | decompose_hangul (ch, r, result_len); |
||
247 | } |
||
248 | else if ((decomp = find_decomposition (ch, FALSE)) != NULL) |
||
249 | { |
||
250 | /* Found it. */ |
||
251 | int i; |
||
252 | |||
253 | *result_len = g_utf8_strlen (decomp, -1); |
||
254 | r = g_malloc (*result_len * sizeof (gunichar)); |
||
255 | |||
256 | for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++) |
||
257 | r[i] = g_utf8_get_char (p); |
||
258 | } |
||
259 | else |
||
260 | { |
||
261 | /* Not in our table. */ |
||
262 | r = g_malloc (sizeof (gunichar)); |
||
263 | *r = ch; |
||
264 | *result_len = 1; |
||
265 | } |
||
266 | |||
267 | return r; |
||
268 | } |
||
269 | |||
270 | /* L,V => LV and LV,T => LVT */ |
||
271 | static gboolean |
||
272 | combine_hangul (gunichar a, |
||
273 | gunichar b, |
||
274 | gunichar *result) |
||
275 | { |
||
276 | gint LIndex = a - LBase; |
||
277 | gint SIndex = a - SBase; |
||
278 | |||
279 | gint VIndex = b - VBase; |
||
280 | gint TIndex = b - TBase; |
||
281 | |||
282 | if (0 <= LIndex && LIndex < LCount |
||
283 | && 0 <= VIndex && VIndex < VCount) |
||
284 | { |
||
285 | *result = SBase + (LIndex * VCount + VIndex) * TCount; |
||
286 | return TRUE; |
||
287 | } |
||
288 | else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 |
||
289 | && 0 < TIndex && TIndex < TCount) |
||
290 | { |
||
291 | *result = a + TIndex; |
||
292 | return TRUE; |
||
293 | } |
||
294 | |||
295 | return FALSE; |
||
296 | } |
||
297 | |||
/* Page lookup for the composition index tables.
 *
 * A compose_table entry >= G_UNICODE_MAX_TABLE_INDEX means the whole page
 * shares one index, stored as (entry - G_UNICODE_MAX_TABLE_INDEX). Any
 * other entry selects a row of compose_data, indexed by the low byte of
 * the character.
 */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of Char; 0 for characters beyond the last table page.
 * Every use of Char is parenthesized so that arbitrary expressions can be
 * passed safely. Note that Char is evaluated more than once.
 */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
||
305 | |||
306 | static gboolean |
||
307 | combine (gunichar a, |
||
308 | gunichar b, |
||
309 | gunichar *result) |
||
310 | { |
||
311 | gushort index_a, index_b; |
||
312 | |||
313 | if (combine_hangul (a, b, result)) |
||
314 | return TRUE; |
||
315 | |||
316 | index_a = COMPOSE_INDEX(a); |
||
317 | |||
318 | if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) |
||
319 | { |
||
320 | if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) |
||
321 | { |
||
322 | *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; |
||
323 | return TRUE; |
||
324 | } |
||
325 | else |
||
326 | return FALSE; |
||
327 | } |
||
328 | |||
329 | index_b = COMPOSE_INDEX(b); |
||
330 | |||
331 | if (index_b >= COMPOSE_SECOND_SINGLE_START) |
||
332 | { |
||
333 | if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) |
||
334 | { |
||
335 | *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; |
||
336 | return TRUE; |
||
337 | } |
||
338 | else |
||
339 | return FALSE; |
||
340 | } |
||
341 | |||
342 | if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START && |
||
343 | index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START) |
||
344 | { |
||
345 | gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START]; |
||
346 | |||
347 | if (res) |
||
348 | { |
||
349 | *result = res; |
||
350 | return TRUE; |
||
351 | } |
||
352 | } |
||
353 | |||
354 | return FALSE; |
||
355 | } |
||
356 | |||
357 | gunichar * |
||
358 | _g_utf8_normalize_wc (const gchar *str, |
||
359 | gssize max_len, |
||
360 | GNormalizeMode mode) |
||
361 | { |
||
362 | gsize n_wc; |
||
363 | gunichar *wc_buffer; |
||
364 | const char *p; |
||
365 | gsize last_start; |
||
366 | gboolean do_compat = (mode == G_NORMALIZE_NFKC || |
||
367 | mode == G_NORMALIZE_NFKD); |
||
368 | gboolean do_compose = (mode == G_NORMALIZE_NFC || |
||
369 | mode == G_NORMALIZE_NFKC); |
||
370 | |||
371 | n_wc = 0; |
||
372 | p = str; |
||
373 | while ((max_len < 0 || p < str + max_len) && *p) |
||
374 | { |
||
375 | const gchar *decomp; |
||
376 | gunichar wc = g_utf8_get_char (p); |
||
377 | |||
378 | if (wc >= SBase && wc < SBase + SCount) |
||
379 | { |
||
380 | gsize result_len; |
||
381 | decompose_hangul (wc, NULL, &result_len); |
||
382 | n_wc += result_len; |
||
383 | } |
||
384 | else |
||
385 | { |
||
386 | decomp = find_decomposition (wc, do_compat); |
||
387 | |||
388 | if (decomp) |
||
389 | n_wc += g_utf8_strlen (decomp, -1); |
||
390 | else |
||
391 | n_wc++; |
||
392 | } |
||
393 | |||
394 | p = g_utf8_next_char (p); |
||
395 | } |
||
396 | |||
397 | wc_buffer = g_new (gunichar, n_wc + 1); |
||
398 | |||
399 | last_start = 0; |
||
400 | n_wc = 0; |
||
401 | p = str; |
||
402 | while ((max_len < 0 || p < str + max_len) && *p) |
||
403 | { |
||
404 | gunichar wc = g_utf8_get_char (p); |
||
405 | const gchar *decomp; |
||
406 | int cc; |
||
407 | gsize old_n_wc = n_wc; |
||
408 | |||
409 | if (wc >= SBase && wc < SBase + SCount) |
||
410 | { |
||
411 | gsize result_len; |
||
412 | decompose_hangul (wc, wc_buffer + n_wc, &result_len); |
||
413 | n_wc += result_len; |
||
414 | } |
||
415 | else |
||
416 | { |
||
417 | decomp = find_decomposition (wc, do_compat); |
||
418 | |||
419 | if (decomp) |
||
420 | { |
||
421 | const char *pd; |
||
422 | for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd)) |
||
423 | wc_buffer[n_wc++] = g_utf8_get_char (pd); |
||
424 | } |
||
425 | else |
||
426 | wc_buffer[n_wc++] = wc; |
||
427 | } |
||
428 | |||
429 | if (n_wc > 0) |
||
430 | { |
||
431 | cc = COMBINING_CLASS (wc_buffer[old_n_wc]); |
||
432 | |||
433 | if (cc == 0) |
||
434 | { |
||
435 | g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); |
||
436 | last_start = old_n_wc; |
||
437 | } |
||
438 | } |
||
439 | |||
440 | p = g_utf8_next_char (p); |
||
441 | } |
||
442 | |||
443 | if (n_wc > 0) |
||
444 | { |
||
445 | g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); |
||
446 | last_start = n_wc; |
||
447 | } |
||
448 | |||
449 | wc_buffer[n_wc] = 0; |
||
450 | |||
451 | /* All decomposed and reordered */ |
||
452 | |||
453 | if (do_compose && n_wc > 0) |
||
454 | { |
||
455 | gsize i, j; |
||
456 | int last_cc = 0; |
||
457 | last_start = 0; |
||
458 | |||
459 | for (i = 0; i < n_wc; i++) |
||
460 | { |
||
461 | int cc = COMBINING_CLASS (wc_buffer[i]); |
||
462 | |||
463 | if (i > 0 && |
||
464 | (last_cc == 0 || last_cc < cc) && |
||
465 | combine (wc_buffer[last_start], wc_buffer[i], |
||
466 | &wc_buffer[last_start])) |
||
467 | { |
||
468 | for (j = i + 1; j < n_wc; j++) |
||
469 | wc_buffer[j-1] = wc_buffer[j]; |
||
470 | n_wc--; |
||
471 | i--; |
||
472 | |||
473 | if (i == last_start) |
||
474 | last_cc = 0; |
||
475 | else |
||
476 | last_cc = COMBINING_CLASS (wc_buffer[i-1]); |
||
477 | |||
478 | continue; |
||
479 | } |
||
480 | |||
481 | if (cc == 0) |
||
482 | last_start = i; |
||
483 | |||
484 | last_cc = cc; |
||
485 | } |
||
486 | } |
||
487 | |||
488 | wc_buffer[n_wc] = 0; |
||
489 | |||
490 | return wc_buffer; |
||
491 | } |
||
492 | |||
493 | /** |
||
494 | * g_utf8_normalize: |
||
495 | * @str: a UTF-8 encoded string. |
||
496 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
||
497 | * @mode: the type of normalization to perform. |
||
498 | * |
||
499 | * Converts a string into canonical form, standardizing |
||
500 | * such issues as whether a character with an accent |
||
501 | * is represented as a base character and combining |
||
502 | * accent or as a single precomposed character. The |
||
503 | * string has to be valid UTF-8, otherwise %NULL is |
||
504 | * returned. You should generally call g_utf8_normalize() |
||
505 | * before comparing two Unicode strings. |
||
506 | * |
||
507 | * The normalization mode %G_NORMALIZE_DEFAULT only |
||
508 | * standardizes differences that do not affect the |
||
509 | * text content, such as the above-mentioned accent |
||
510 | * representation. %G_NORMALIZE_ALL also standardizes |
||
511 | * the "compatibility" characters in Unicode, such |
||
512 | * as SUPERSCRIPT THREE to the standard forms |
||
513 | * (in this case DIGIT THREE). Formatting information |
||
514 | * may be lost but for most text operations such |
||
515 | * characters should be considered the same. |
||
516 | * |
||
517 | * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE |
||
518 | * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, |
||
519 | * but returned a result with composed forms rather |
||
520 | * than a maximally decomposed form. This is often |
||
521 | * useful if you intend to convert the string to |
||
522 | * a legacy encoding or pass it to a system with |
||
523 | * less capable Unicode handling. |
||
524 | * |
||
525 | * Returns: a newly allocated string, that is the |
||
526 | * normalized form of @str, or %NULL if @str is not |
||
527 | * valid UTF-8. |
||
528 | **/ |
||
529 | gchar * |
||
530 | g_utf8_normalize (const gchar *str, |
||
531 | gssize len, |
||
532 | GNormalizeMode mode) |
||
533 | { |
||
534 | gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); |
||
535 | gchar *result; |
||
536 | |||
537 | result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); |
||
538 | g_free (result_wc); |
||
539 | |||
540 | return result; |
||
541 | } |
||
542 | |||
543 | static gboolean |
||
544 | decompose_hangul_step (gunichar ch, |
||
545 | gunichar *a, |
||
546 | gunichar *b) |
||
547 | { |
||
548 | gint SIndex, TIndex; |
||
549 | |||
550 | if (ch < SBase || ch >= SBase + SCount) |
||
551 | return FALSE; /* not a hangul syllable */ |
||
552 | |||
553 | SIndex = ch - SBase; |
||
554 | TIndex = SIndex % TCount; |
||
555 | |||
556 | if (TIndex) |
||
557 | { |
||
558 | /* split LVT -> LV,T */ |
||
559 | *a = ch - TIndex; |
||
560 | *b = TBase + TIndex; |
||
561 | } |
||
562 | else |
||
563 | { |
||
564 | /* split LV -> L,V */ |
||
565 | *a = LBase + SIndex / NCount; |
||
566 | *b = VBase + (SIndex % NCount) / TCount; |
||
567 | } |
||
568 | |||
569 | return TRUE; |
||
570 | } |
||
571 | |||
572 | /** |
||
573 | * g_unichar_decompose: |
||
574 | * @ch: a Unicode character |
||
575 | * @a: return location for the first component of @ch |
||
576 | * @b: return location for the second component of @ch |
||
577 | * |
||
578 | * Performs a single decomposition step of the |
||
579 | * Unicode canonical decomposition algorithm. |
||
580 | * |
||
581 | * This function does not include compatibility |
||
582 | * decompositions. It does, however, include algorithmic |
||
583 | * Hangul Jamo decomposition, as well as 'singleton' |
||
584 | * decompositions which replace a character by a single |
||
585 | * other character. In the case of singletons *@b will |
||
586 | * be set to zero. |
||
587 | * |
||
588 | * If @ch is not decomposable, *@a is set to @ch and *@b |
||
589 | * is set to zero. |
||
590 | * |
||
591 | * Note that the way Unicode decomposition pairs are |
||
592 | * defined, it is guaranteed that @b would not decompose |
||
593 | * further, but @a may itself decompose. To get the full |
||
594 | * canonical decomposition for @ch, one would need to |
||
595 | * recursively call this function on @a. Or use |
||
596 | * g_unichar_fully_decompose(). |
||
597 | * |
||
598 | * See |
||
599 | * [UAX#15](http://unicode.org/reports/tr15/) |
||
600 | * for details. |
||
601 | * |
||
602 | * Returns: %TRUE if the character could be decomposed |
||
603 | * |
||
604 | * Since: 2.30 |
||
605 | */ |
||
606 | gboolean |
||
607 | g_unichar_decompose (gunichar ch, |
||
608 | gunichar *a, |
||
609 | gunichar *b) |
||
610 | { |
||
611 | gint start = 0; |
||
612 | gint end = G_N_ELEMENTS (decomp_step_table); |
||
613 | |||
614 | if (decompose_hangul_step (ch, a, b)) |
||
615 | return TRUE; |
||
616 | |||
617 | /* TODO use bsearch() */ |
||
618 | if (ch >= decomp_step_table[start].ch && |
||
619 | ch <= decomp_step_table[end - 1].ch) |
||
620 | { |
||
621 | while (TRUE) |
||
622 | { |
||
623 | gint half = (start + end) / 2; |
||
624 | const decomposition_step *p = &(decomp_step_table[half]); |
||
625 | if (ch == p->ch) |
||
626 | { |
||
627 | *a = p->a; |
||
628 | *b = p->b; |
||
629 | return TRUE; |
||
630 | } |
||
631 | else if (half == start) |
||
632 | break; |
||
633 | else if (ch > p->ch) |
||
634 | start = half; |
||
635 | else |
||
636 | end = half; |
||
637 | } |
||
638 | } |
||
639 | |||
640 | *a = ch; |
||
641 | *b = 0; |
||
642 | |||
643 | return FALSE; |
||
644 | } |
||
645 | |||
646 | /** |
||
647 | * g_unichar_compose: |
||
648 | * @a: a Unicode character |
||
649 | * @b: a Unicode character |
||
650 | * @ch: return location for the composed character |
||
651 | * |
||
652 | * Performs a single composition step of the |
||
653 | * Unicode canonical composition algorithm. |
||
654 | * |
||
655 | * This function includes algorithmic Hangul Jamo composition, |
||
656 | * but it is not exactly the inverse of g_unichar_decompose(). |
||
657 | * No composition can have either of @a or @b equal to zero. |
||
658 | * To be precise, this function composes if and only if |
||
659 | * there exists a Primary Composite P which is canonically |
||
660 | * equivalent to the sequence <@a,@b>. See the Unicode |
||
661 | * Standard for the definition of Primary Composite. |
||
662 | * |
||
663 | * If @a and @b do not compose a new character, @ch is set to zero. |
||
664 | * |
||
665 | * See |
||
666 | * [UAX#15](http://unicode.org/reports/tr15/) |
||
667 | * for details. |
||
668 | * |
||
669 | * Returns: %TRUE if the characters could be composed |
||
670 | * |
||
671 | * Since: 2.30 |
||
672 | */ |
||
673 | gboolean |
||
674 | g_unichar_compose (gunichar a, |
||
675 | gunichar b, |
||
676 | gunichar *ch) |
||
677 | { |
||
678 | if (combine (a, b, ch)) |
||
679 | return TRUE; |
||
680 | |||
681 | *ch = 0; |
||
682 | return FALSE; |
||
683 | } |
||
684 | |||
685 | /** |
||
686 | * g_unichar_fully_decompose: |
||
687 | * @ch: a Unicode character. |
||
688 | * @compat: whether perform canonical or compatibility decomposition |
||
689 | * @result: (allow-none): location to store decomposed result, or %NULL |
||
690 | * @result_len: length of @result |
||
691 | * |
||
692 | * Computes the canonical or compatibility decomposition of a |
||
693 | * Unicode character. For compatibility decomposition, |
||
694 | * pass %TRUE for @compat; for canonical decomposition |
||
695 | * pass %FALSE for @compat. |
||
696 | * |
||
697 | * The decomposed sequence is placed in @result. Only up to |
||
698 | * @result_len characters are written into @result. The length |
||
699 | * of the full decomposition (irrespective of @result_len) is |
||
700 | * returned by the function. For canonical decomposition, |
||
701 | * currently all decompositions are of length at most 4, but |
||
702 | * this may change in the future (very unlikely though). |
||
703 | * At any rate, Unicode does guarantee that a buffer of length |
||
704 | * 18 is always enough for both compatibility and canonical |
||
705 | * decompositions, so that is the size recommended. This is provided |
||
706 | * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH. |
||
707 | * |
||
708 | * See |
||
709 | * [UAX#15](http://unicode.org/reports/tr15/) |
||
710 | * for details. |
||
711 | * |
||
712 | * Returns: the length of the full decomposition. |
||
713 | * |
||
714 | * Since: 2.30 |
||
715 | **/ |
||
716 | gsize |
||
717 | g_unichar_fully_decompose (gunichar ch, |
||
718 | gboolean compat, |
||
719 | gunichar *result, |
||
720 | gsize result_len) |
||
721 | { |
||
722 | const gchar *decomp; |
||
723 | const gchar *p; |
||
724 | |||
725 | /* Hangul syllable */ |
||
726 | if (ch >= SBase && ch < SBase + SCount) |
||
727 | { |
||
728 | gsize len, i; |
||
729 | gunichar buffer[3]; |
||
730 | decompose_hangul (ch, result ? buffer : NULL, &len); |
||
731 | if (result) |
||
732 | for (i = 0; i < len && i < result_len; i++) |
||
733 | result[i] = buffer[i]; |
||
734 | return len; |
||
735 | } |
||
736 | else if ((decomp = find_decomposition (ch, compat)) != NULL) |
||
737 | { |
||
738 | /* Found it. */ |
||
739 | gsize len, i; |
||
740 | |||
741 | len = g_utf8_strlen (decomp, -1); |
||
742 | |||
743 | for (p = decomp, i = 0; i < len && i < result_len; p = g_utf8_next_char (p), i++) |
||
744 | result[i] = g_utf8_get_char (p); |
||
745 | |||
746 | return len; |
||
747 | } |
||
748 | |||
749 | /* Does not decompose */ |
||
750 | if (result && result_len >= 1) |
||
751 | *result = ch; |
||
752 | return 1; |
||
753 | } |