nexmon – Blame information for rev 1

Subversion Repositories:
Rev:
Rev Author Line No. Line
1 office 1 /* decomp.c - Character decomposition.
2 *
3 * Copyright (C) 1999, 2000 Tom Tromey
4 * Copyright 2000 Red Hat, Inc.
5 *
6 * The Gnome Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The Gnome Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with the Gnome Library; see the file COPYING.LIB. If not,
18 * see <http://www.gnu.org/licenses/>.
19 */
20  
21 /**
22 * SECTION:unicode
23 * @Title: Unicode Manipulation
24 * @Short_description: functions operating on Unicode characters and
25 * UTF-8 strings
26 * @See_also: g_locale_to_utf8(), g_locale_from_utf8()
27 *
28 * This section describes a number of functions for dealing with
29 * Unicode characters and strings. There are analogues of the
30 * traditional `ctype.h` character classification and case conversion
31 * functions, UTF-8 analogues of some string utility functions,
32 * functions to perform normalization, case conversion and collation
33 * on UTF-8 strings and finally functions to convert between the UTF-8,
34 * UTF-16 and UCS-4 encodings of Unicode.
35 *
36 * The implementations of the Unicode functions in GLib are based
37 * on the Unicode Character Data tables, which are available from
38 * [www.unicode.org](http://www.unicode.org/).
39 * GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1,
40 * GLib 2.12 supports Unicode 5.0, GLib 2.16.3 supports Unicode 5.1,
41 * GLib 2.30 supports Unicode 6.0.
42 */
43  
44 #include "config.h"
45  
46 #include <stdlib.h>
47  
48 #include "gunicode.h"
49 #include "gunidecomp.h"
50 #include "gmem.h"
51 #include "gunicomp.h"
52 #include "gunicodeprivate.h"
53  
54  
/* Shared lookup for both combining-class page tables.
 *
 * A page-table entry at or above G_UNICODE_MAX_TABLE_INDEX means the whole
 * 256-character page has one class, stored as (entry - G_UNICODE_MAX_TABLE_INDEX).
 * Any smaller entry selects a row of cclass_data, which is then indexed by
 * the low byte of the character.
 */
#define CC_LOOKUP(Table, Page, Char) \
  (((Table)[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[(Table)[Page]][Char]) \
   : ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Lookup for characters up to G_UNICODE_LAST_CHAR_PART1. */
#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

/* Lookup for characters in 0xe0000 .. G_UNICODE_LAST_CHAR; Page is relative
 * to 0xe0000. */
#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of Char.  Characters that fall in neither table range
 * yield 0.  Note that Char is evaluated more than once. */
#define COMBINING_CLASS(Char) \
  (((Char) > G_UNICODE_LAST_CHAR_PART1) \
   ? (((Char) < 0xe0000 || (Char) > G_UNICODE_LAST_CHAR) \
      ? 0 \
      : CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)) \
   : CC_PART1 ((Char) >> 8, (Char) & 0xff))
71  
72 /**
73 * g_unichar_combining_class:
74 * @uc: a Unicode character
75 *
76 * Determines the canonical combining class of a Unicode character.
77 *
78 * Returns: the combining class of the character
79 *
80 * Since: 2.14
81 **/
82 gint
83 g_unichar_combining_class (gunichar uc)
84 {
85 return COMBINING_CLASS (uc);
86 }
87  
/* Constants for Hangul syllable [de]composition.
 *
 * SBase is the first precomposed syllable; LBase and VBase are the first
 * leading and vowel jamo.  TBase sits one below the first trailing jamo,
 * so a trailing index of 0 means "no trailing jamo".  The *Count values
 * give the size of each range.
 */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,

  LCount = 19,
  VCount = 21,
  TCount = 28,

  NCount = VCount * TCount,   /* syllables per leading jamo */
  SCount = LCount * NCount    /* total precomposed syllables */
};
98  
99 /**
100 * g_unicode_canonical_ordering:
101 * @string: a UCS-4 encoded string.
102 * @len: the maximum length of @string to use.
103 *
104 * Computes the canonical ordering of a string in-place.
105 * This rearranges decomposed characters in the string
106 * according to their combining classes. See the Unicode
107 * manual for more information.
108 **/
109 void
110 g_unicode_canonical_ordering (gunichar *string,
111 gsize len)
112 {
113 gsize i;
114 int swap = 1;
115  
116 while (swap)
117 {
118 int last;
119 swap = 0;
120 last = COMBINING_CLASS (string[0]);
121 for (i = 0; i < len - 1; ++i)
122 {
123 int next = COMBINING_CLASS (string[i + 1]);
124 if (next != 0 && last > next)
125 {
126 gsize j;
127 /* Percolate item leftward through string. */
128 for (j = i + 1; j > 0; --j)
129 {
130 gunichar t;
131 if (COMBINING_CLASS (string[j - 1]) <= next)
132 break;
133 t = string[j];
134 string[j] = string[j - 1];
135 string[j - 1] = t;
136 swap = 1;
137 }
138 /* We're re-entering the loop looking at the old
139 character again. */
140 next = last;
141 }
142 last = next;
143 }
144 }
145 }
146  
147 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
148 * r should be null or have sufficient space. Calling with r == NULL will
149 * only calculate the result_len; however, a buffer with space for three
150 * characters will always be big enough. */
151 static void
152 decompose_hangul (gunichar s,
153 gunichar *r,
154 gsize *result_len)
155 {
156 gint SIndex = s - SBase;
157 gint TIndex = SIndex % TCount;
158  
159 if (r)
160 {
161 r[0] = LBase + SIndex / NCount;
162 r[1] = VBase + (SIndex % NCount) / TCount;
163 }
164  
165 if (TIndex)
166 {
167 if (r)
168 r[2] = TBase + TIndex;
169 *result_len = 3;
170 }
171 else
172 *result_len = 2;
173 }
174  
175 /* returns a pointer to a null-terminated UTF-8 string */
176 static const gchar *
177 find_decomposition (gunichar ch,
178 gboolean compat)
179 {
180 int start = 0;
181 int end = G_N_ELEMENTS (decomp_table);
182  
183 if (ch >= decomp_table[start].ch &&
184 ch <= decomp_table[end - 1].ch)
185 {
186 while (TRUE)
187 {
188 int half = (start + end) / 2;
189 if (ch == decomp_table[half].ch)
190 {
191 int offset;
192  
193 if (compat)
194 {
195 offset = decomp_table[half].compat_offset;
196 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
197 offset = decomp_table[half].canon_offset;
198 }
199 else
200 {
201 offset = decomp_table[half].canon_offset;
202 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
203 return NULL;
204 }
205  
206 return &(decomp_expansion_string[offset]);
207 }
208 else if (half == start)
209 break;
210 else if (ch > decomp_table[half].ch)
211 start = half;
212 else
213 end = half;
214 }
215 }
216  
217 return NULL;
218 }
219  
220 /**
221 * g_unicode_canonical_decomposition:
222 * @ch: a Unicode character.
223 * @result_len: location to store the length of the return value.
224 *
225 * Computes the canonical decomposition of a Unicode character.
226 *
227 * Returns: a newly allocated string of Unicode characters.
228 * @result_len is set to the resulting length of the string.
229 *
230 * Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose()
231 * instead.
232 **/
233 gunichar *
234 g_unicode_canonical_decomposition (gunichar ch,
235 gsize *result_len)
236 {
237 const gchar *decomp;
238 const gchar *p;
239 gunichar *r;
240  
241 /* Hangul syllable */
242 if (ch >= SBase && ch < SBase + SCount)
243 {
244 decompose_hangul (ch, NULL, result_len);
245 r = g_malloc (*result_len * sizeof (gunichar));
246 decompose_hangul (ch, r, result_len);
247 }
248 else if ((decomp = find_decomposition (ch, FALSE)) != NULL)
249 {
250 /* Found it. */
251 int i;
252  
253 *result_len = g_utf8_strlen (decomp, -1);
254 r = g_malloc (*result_len * sizeof (gunichar));
255  
256 for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
257 r[i] = g_utf8_get_char (p);
258 }
259 else
260 {
261 /* Not in our table. */
262 r = g_malloc (sizeof (gunichar));
263 *r = ch;
264 *result_len = 1;
265 }
266  
267 return r;
268 }
269  
270 /* L,V => LV and LV,T => LVT */
271 static gboolean
272 combine_hangul (gunichar a,
273 gunichar b,
274 gunichar *result)
275 {
276 gint LIndex = a - LBase;
277 gint SIndex = a - SBase;
278  
279 gint VIndex = b - VBase;
280 gint TIndex = b - TBase;
281  
282 if (0 <= LIndex && LIndex < LCount
283 && 0 <= VIndex && VIndex < VCount)
284 {
285 *result = SBase + (LIndex * VCount + VIndex) * TCount;
286 return TRUE;
287 }
288 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
289 && 0 < TIndex && TIndex < TCount)
290 {
291 *result = a + TIndex;
292 return TRUE;
293 }
294  
295 return FALSE;
296 }
297  
/* Looks up the composition index for the character at offset Char within
 * page Page.  A compose_table entry at or above G_UNICODE_MAX_TABLE_INDEX
 * means the whole page shares one index, stored as
 * (entry - G_UNICODE_MAX_TABLE_INDEX); any smaller entry selects a row of
 * compose_data. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of Char, or 0 when its page lies beyond
 * COMPOSE_TABLE_LAST.  Every use of Char is parenthesized so that an
 * expression argument (e.g. `x | y`) is shifted as a whole in the range
 * check.  Note that Char is evaluated more than once. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI ((Char) >> 8, (Char) & 0xff))
305  
306 static gboolean
307 combine (gunichar a,
308 gunichar b,
309 gunichar *result)
310 {
311 gushort index_a, index_b;
312  
313 if (combine_hangul (a, b, result))
314 return TRUE;
315  
316 index_a = COMPOSE_INDEX(a);
317  
318 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
319 {
320 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
321 {
322 *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
323 return TRUE;
324 }
325 else
326 return FALSE;
327 }
328  
329 index_b = COMPOSE_INDEX(b);
330  
331 if (index_b >= COMPOSE_SECOND_SINGLE_START)
332 {
333 if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
334 {
335 *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
336 return TRUE;
337 }
338 else
339 return FALSE;
340 }
341  
342 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
343 index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START)
344 {
345 gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
346  
347 if (res)
348 {
349 *result = res;
350 return TRUE;
351 }
352 }
353  
354 return FALSE;
355 }
356  
357 gunichar *
358 _g_utf8_normalize_wc (const gchar *str,
359 gssize max_len,
360 GNormalizeMode mode)
361 {
362 gsize n_wc;
363 gunichar *wc_buffer;
364 const char *p;
365 gsize last_start;
366 gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
367 mode == G_NORMALIZE_NFKD);
368 gboolean do_compose = (mode == G_NORMALIZE_NFC ||
369 mode == G_NORMALIZE_NFKC);
370  
371 n_wc = 0;
372 p = str;
373 while ((max_len < 0 || p < str + max_len) && *p)
374 {
375 const gchar *decomp;
376 gunichar wc = g_utf8_get_char (p);
377  
378 if (wc >= SBase && wc < SBase + SCount)
379 {
380 gsize result_len;
381 decompose_hangul (wc, NULL, &result_len);
382 n_wc += result_len;
383 }
384 else
385 {
386 decomp = find_decomposition (wc, do_compat);
387  
388 if (decomp)
389 n_wc += g_utf8_strlen (decomp, -1);
390 else
391 n_wc++;
392 }
393  
394 p = g_utf8_next_char (p);
395 }
396  
397 wc_buffer = g_new (gunichar, n_wc + 1);
398  
399 last_start = 0;
400 n_wc = 0;
401 p = str;
402 while ((max_len < 0 || p < str + max_len) && *p)
403 {
404 gunichar wc = g_utf8_get_char (p);
405 const gchar *decomp;
406 int cc;
407 gsize old_n_wc = n_wc;
408  
409 if (wc >= SBase && wc < SBase + SCount)
410 {
411 gsize result_len;
412 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
413 n_wc += result_len;
414 }
415 else
416 {
417 decomp = find_decomposition (wc, do_compat);
418  
419 if (decomp)
420 {
421 const char *pd;
422 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
423 wc_buffer[n_wc++] = g_utf8_get_char (pd);
424 }
425 else
426 wc_buffer[n_wc++] = wc;
427 }
428  
429 if (n_wc > 0)
430 {
431 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
432  
433 if (cc == 0)
434 {
435 g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
436 last_start = old_n_wc;
437 }
438 }
439  
440 p = g_utf8_next_char (p);
441 }
442  
443 if (n_wc > 0)
444 {
445 g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
446 last_start = n_wc;
447 }
448  
449 wc_buffer[n_wc] = 0;
450  
451 /* All decomposed and reordered */
452  
453 if (do_compose && n_wc > 0)
454 {
455 gsize i, j;
456 int last_cc = 0;
457 last_start = 0;
458  
459 for (i = 0; i < n_wc; i++)
460 {
461 int cc = COMBINING_CLASS (wc_buffer[i]);
462  
463 if (i > 0 &&
464 (last_cc == 0 || last_cc < cc) &&
465 combine (wc_buffer[last_start], wc_buffer[i],
466 &wc_buffer[last_start]))
467 {
468 for (j = i + 1; j < n_wc; j++)
469 wc_buffer[j-1] = wc_buffer[j];
470 n_wc--;
471 i--;
472  
473 if (i == last_start)
474 last_cc = 0;
475 else
476 last_cc = COMBINING_CLASS (wc_buffer[i-1]);
477  
478 continue;
479 }
480  
481 if (cc == 0)
482 last_start = i;
483  
484 last_cc = cc;
485 }
486 }
487  
488 wc_buffer[n_wc] = 0;
489  
490 return wc_buffer;
491 }
492  
493 /**
494 * g_utf8_normalize:
495 * @str: a UTF-8 encoded string.
496 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
497 * @mode: the type of normalization to perform.
498 *
499 * Converts a string into canonical form, standardizing
500 * such issues as whether a character with an accent
501 * is represented as a base character and combining
502 * accent or as a single precomposed character. The
503 * string has to be valid UTF-8, otherwise %NULL is
504 * returned. You should generally call g_utf8_normalize()
505 * before comparing two Unicode strings.
506 *
507 * The normalization mode %G_NORMALIZE_DEFAULT only
508 * standardizes differences that do not affect the
509 * text content, such as the above-mentioned accent
510 * representation. %G_NORMALIZE_ALL also standardizes
511 * the "compatibility" characters in Unicode, such
512 * as SUPERSCRIPT THREE to the standard forms
513 * (in this case DIGIT THREE). Formatting information
514 * may be lost but for most text operations such
515 * characters should be considered the same.
516 *
517 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
518 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
519 * but returned a result with composed forms rather
520 * than a maximally decomposed form. This is often
521 * useful if you intend to convert the string to
522 * a legacy encoding or pass it to a system with
523 * less capable Unicode handling.
524 *
525 * Returns: a newly allocated string, that is the
526 * normalized form of @str, or %NULL if @str is not
527 * valid UTF-8.
528 **/
529 gchar *
530 g_utf8_normalize (const gchar *str,
531 gssize len,
532 GNormalizeMode mode)
533 {
534 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
535 gchar *result;
536  
537 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
538 g_free (result_wc);
539  
540 return result;
541 }
542  
543 static gboolean
544 decompose_hangul_step (gunichar ch,
545 gunichar *a,
546 gunichar *b)
547 {
548 gint SIndex, TIndex;
549  
550 if (ch < SBase || ch >= SBase + SCount)
551 return FALSE; /* not a hangul syllable */
552  
553 SIndex = ch - SBase;
554 TIndex = SIndex % TCount;
555  
556 if (TIndex)
557 {
558 /* split LVT -> LV,T */
559 *a = ch - TIndex;
560 *b = TBase + TIndex;
561 }
562 else
563 {
564 /* split LV -> L,V */
565 *a = LBase + SIndex / NCount;
566 *b = VBase + (SIndex % NCount) / TCount;
567 }
568  
569 return TRUE;
570 }
571  
572 /**
573 * g_unichar_decompose:
574 * @ch: a Unicode character
575 * @a: return location for the first component of @ch
576 * @b: return location for the second component of @ch
577 *
578 * Performs a single decomposition step of the
579 * Unicode canonical decomposition algorithm.
580 *
581 * This function does not include compatibility
582 * decompositions. It does, however, include algorithmic
583 * Hangul Jamo decomposition, as well as 'singleton'
584 * decompositions which replace a character by a single
585 * other character. In the case of singletons *@b will
586 * be set to zero.
587 *
588 * If @ch is not decomposable, *@a is set to @ch and *@b
589 * is set to zero.
590 *
591 * Note that the way Unicode decomposition pairs are
592 * defined, it is guaranteed that @b would not decompose
593 * further, but @a may itself decompose. To get the full
594 * canonical decomposition for @ch, one would need to
595 * recursively call this function on @a. Or use
596 * g_unichar_fully_decompose().
597 *
598 * See
599 * [UAX#15](http://unicode.org/reports/tr15/)
600 * for details.
601 *
602 * Returns: %TRUE if the character could be decomposed
603 *
604 * Since: 2.30
605 */
606 gboolean
607 g_unichar_decompose (gunichar ch,
608 gunichar *a,
609 gunichar *b)
610 {
611 gint start = 0;
612 gint end = G_N_ELEMENTS (decomp_step_table);
613  
614 if (decompose_hangul_step (ch, a, b))
615 return TRUE;
616  
617 /* TODO use bsearch() */
618 if (ch >= decomp_step_table[start].ch &&
619 ch <= decomp_step_table[end - 1].ch)
620 {
621 while (TRUE)
622 {
623 gint half = (start + end) / 2;
624 const decomposition_step *p = &(decomp_step_table[half]);
625 if (ch == p->ch)
626 {
627 *a = p->a;
628 *b = p->b;
629 return TRUE;
630 }
631 else if (half == start)
632 break;
633 else if (ch > p->ch)
634 start = half;
635 else
636 end = half;
637 }
638 }
639  
640 *a = ch;
641 *b = 0;
642  
643 return FALSE;
644 }
645  
646 /**
647 * g_unichar_compose:
648 * @a: a Unicode character
649 * @b: a Unicode character
650 * @ch: return location for the composed character
651 *
652 * Performs a single composition step of the
653 * Unicode canonical composition algorithm.
654 *
655 * This function includes algorithmic Hangul Jamo composition,
656 * but it is not exactly the inverse of g_unichar_decompose().
657 * No composition can have either of @a or @b equal to zero.
658 * To be precise, this function composes if and only if
659 * there exists a Primary Composite P which is canonically
660 * equivalent to the sequence <@a,@b>. See the Unicode
661 * Standard for the definition of Primary Composite.
662 *
663 * If @a and @b do not compose a new character, @ch is set to zero.
664 *
665 * See
666 * [UAX#15](http://unicode.org/reports/tr15/)
667 * for details.
668 *
669 * Returns: %TRUE if the characters could be composed
670 *
671 * Since: 2.30
672 */
673 gboolean
674 g_unichar_compose (gunichar a,
675 gunichar b,
676 gunichar *ch)
677 {
678 if (combine (a, b, ch))
679 return TRUE;
680  
681 *ch = 0;
682 return FALSE;
683 }
684  
685 /**
686 * g_unichar_fully_decompose:
687 * @ch: a Unicode character.
688 * @compat: whether perform canonical or compatibility decomposition
689 * @result: (allow-none): location to store decomposed result, or %NULL
690 * @result_len: length of @result
691 *
692 * Computes the canonical or compatibility decomposition of a
693 * Unicode character. For compatibility decomposition,
694 * pass %TRUE for @compat; for canonical decomposition
695 * pass %FALSE for @compat.
696 *
697 * The decomposed sequence is placed in @result. Only up to
698 * @result_len characters are written into @result. The length
699 * of the full decomposition (irrespective of @result_len) is
700 * returned by the function. For canonical decomposition,
701 * currently all decompositions are of length at most 4, but
702 * this may change in the future (very unlikely though).
703 * At any rate, Unicode does guarantee that a buffer of length
704 * 18 is always enough for both compatibility and canonical
705 * decompositions, so that is the size recommended. This is provided
706 * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH.
707 *
708 * See
709 * [UAX#15](http://unicode.org/reports/tr15/)
710 * for details.
711 *
712 * Returns: the length of the full decomposition.
713 *
714 * Since: 2.30
715 **/
716 gsize
717 g_unichar_fully_decompose (gunichar ch,
718 gboolean compat,
719 gunichar *result,
720 gsize result_len)
721 {
722 const gchar *decomp;
723 const gchar *p;
724  
725 /* Hangul syllable */
726 if (ch >= SBase && ch < SBase + SCount)
727 {
728 gsize len, i;
729 gunichar buffer[3];
730 decompose_hangul (ch, result ? buffer : NULL, &len);
731 if (result)
732 for (i = 0; i < len && i < result_len; i++)
733 result[i] = buffer[i];
734 return len;
735 }
736 else if ((decomp = find_decomposition (ch, compat)) != NULL)
737 {
738 /* Found it. */
739 gsize len, i;
740  
741 len = g_utf8_strlen (decomp, -1);
742  
743 for (p = decomp, i = 0; i < len && i < result_len; p = g_utf8_next_char (p), i++)
744 result[i] = g_utf8_get_char (p);
745  
746 return len;
747 }
748  
749 /* Does not decompose */
750 if (result && result_len >= 1)
751 *result = ch;
752 return 1;
753 }