nexmon – Blame information for rev 1
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | #! /usr/bin/perl -w |
2 | |||
3 | # Copyright (C) 1998, 1999 Tom Tromey |
||
4 | # Copyright (C) 2001 Red Hat Software |
||
5 | |||
6 | # This program is free software; you can redistribute it and/or modify |
||
7 | # it under the terms of the GNU General Public License as published by |
||
8 | # the Free Software Foundation; either version 2, or (at your option) |
||
9 | # any later version. |
||
10 | |||
11 | # This program is distributed in the hope that it will be useful, |
||
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
14 | # GNU General Public License for more details. |
||
15 | |||
16 | # You should have received a copy of the GNU General Public License |
||
17 | # along with this program; if not, see <http://www.gnu.org/licenses/>. |
||
18 | |||
19 | # Contributor(s): |
||
20 | # Andrew Taylor <andrew.taylor@montage.ca> |
||
21 | |||
22 | # gen-unicode-tables.pl - Generate tables for libunicode from Unicode data. |
||
23 | # See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html |
||
24 | # I consider the output of this program to be unrestricted. Use it as |
||
25 | # you will. |
||
26 | |||
27 | # FIXME: |
||
28 | # * For decomp table it might make sense to use a shift count other |
||
29 | # than 8. We could easily compute the perfect shift count. |
||
30 | |||
31 | # we use some perl unicode features |
||
require 5.006;

# Byte semantics are wanted throughout: string lengths and hex dumps below
# are computed on the UTF-8 encoded form.
use bytes;

# Package globals naming the column indices of each input file.  The list
# now includes $BREAK_PROPERTY and the $FOLDING_* names, which the script
# assigns and uses but which were missing from the declaration.
# $BREAK_CATEGORY and $BREAK_NAME are kept so that any code relying on them
# being declared keeps working.
use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY
            $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE
            $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE
            $BREAK_CODE $BREAK_PROPERTY $BREAK_CATEGORY $BREAK_NAME
            $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION
            $FOLDING_CODE $FOLDING_STATUS $FOLDING_MAPPING);


# Names of fields in Unicode data table.
$CODE              = 0;
$NAME              = 1;
$CATEGORY          = 2;
$COMBINING_CLASSES = 3;
$BIDI_CATEGORY     = 4;
$DECOMPOSITION     = 5;
$DECIMAL_VALUE     = 6;
$DIGIT_VALUE       = 7;
$NUMERIC_VALUE     = 8;
$MIRRORED          = 9;
$OLD_NAME          = 10;
$COMMENT           = 11;
$UPPER             = 12;
$LOWER             = 13;
$TITLE             = 14;

# Names of fields in the line break table
$BREAK_CODE     = 0;
$BREAK_PROPERTY = 1;

# Names of fields in the SpecialCasing table
$CASE_CODE      = 0;
$CASE_LOWER     = 1;
$CASE_TITLE     = 2;
$CASE_UPPER     = 3;
$CASE_CONDITION = 4;

# Names of fields in the CaseFolding table
$FOLDING_CODE    = 0;
$FOLDING_STATUS  = 1;
$FOLDING_MAPPING = 2;
||
71 | |||
# Map general category code onto symbolic name.  Every symbolic name
# shares the G_UNICODE_ prefix, so only the distinguishing suffix is
# listed here.
%mappings = ();
foreach my $entry (
    # Normative.
    [ 'Lu', 'UPPERCASE_LETTER' ],
    [ 'Ll', 'LOWERCASE_LETTER' ],
    [ 'Lt', 'TITLECASE_LETTER' ],
    [ 'Mn', 'NON_SPACING_MARK' ],
    [ 'Mc', 'SPACING_MARK' ],
    [ 'Me', 'ENCLOSING_MARK' ],
    [ 'Nd', 'DECIMAL_NUMBER' ],
    [ 'Nl', 'LETTER_NUMBER' ],
    [ 'No', 'OTHER_NUMBER' ],
    [ 'Zs', 'SPACE_SEPARATOR' ],
    [ 'Zl', 'LINE_SEPARATOR' ],
    [ 'Zp', 'PARAGRAPH_SEPARATOR' ],
    [ 'Cc', 'CONTROL' ],
    [ 'Cf', 'FORMAT' ],
    [ 'Cs', 'SURROGATE' ],
    [ 'Co', 'PRIVATE_USE' ],
    [ 'Cn', 'UNASSIGNED' ],

    # Informative.
    [ 'Lm', 'MODIFIER_LETTER' ],
    [ 'Lo', 'OTHER_LETTER' ],
    [ 'Pc', 'CONNECT_PUNCTUATION' ],
    [ 'Pd', 'DASH_PUNCTUATION' ],
    [ 'Ps', 'OPEN_PUNCTUATION' ],
    [ 'Pe', 'CLOSE_PUNCTUATION' ],
    [ 'Pi', 'INITIAL_PUNCTUATION' ],
    [ 'Pf', 'FINAL_PUNCTUATION' ],
    [ 'Po', 'OTHER_PUNCTUATION' ],
    [ 'Sm', 'MATH_SYMBOL' ],
    [ 'Sc', 'CURRENCY_SYMBOL' ],
    [ 'Sk', 'MODIFIER_SYMBOL' ],
    [ 'So', 'OTHER_SYMBOL' ],
) {
    $mappings{$entry->[0]} = "G_UNICODE_" . $entry->[1];
}
||
109 | |||
# Map line-break class abbreviation onto symbolic name.  Every symbolic
# name shares the G_UNICODE_BREAK_ prefix, so only the suffix is listed.
# (CLOSE_PARANTHESIS is spelled exactly as the original table spells it.)
%break_mappings = ();
foreach my $entry (
    [ 'AI', 'AMBIGUOUS' ],
    [ 'AL', 'ALPHABETIC' ],
    [ 'B2', 'BEFORE_AND_AFTER' ],
    [ 'BA', 'AFTER' ],
    [ 'BB', 'BEFORE' ],
    [ 'BK', 'MANDATORY' ],
    [ 'CB', 'CONTINGENT' ],
    [ 'CJ', 'CONDITIONAL_JAPANESE_STARTER' ],
    [ 'CL', 'CLOSE_PUNCTUATION' ],
    [ 'CM', 'COMBINING_MARK' ],
    [ 'CP', 'CLOSE_PARANTHESIS' ],
    [ 'CR', 'CARRIAGE_RETURN' ],
    [ 'EX', 'EXCLAMATION' ],
    [ 'GL', 'NON_BREAKING_GLUE' ],
    [ 'H2', 'HANGUL_LV_SYLLABLE' ],
    [ 'H3', 'HANGUL_LVT_SYLLABLE' ],
    [ 'HL', 'HEBREW_LETTER' ],
    [ 'HY', 'HYPHEN' ],
    [ 'ID', 'IDEOGRAPHIC' ],
    [ 'IN', 'INSEPARABLE' ],
    [ 'IS', 'INFIX_SEPARATOR' ],
    [ 'JL', 'HANGUL_L_JAMO' ],
    [ 'JT', 'HANGUL_T_JAMO' ],
    [ 'JV', 'HANGUL_V_JAMO' ],
    [ 'LF', 'LINE_FEED' ],
    [ 'NL', 'NEXT_LINE' ],
    [ 'NS', 'NON_STARTER' ],
    [ 'NU', 'NUMERIC' ],
    [ 'OP', 'OPEN_PUNCTUATION' ],
    [ 'PO', 'POSTFIX' ],
    [ 'PR', 'PREFIX' ],
    [ 'QU', 'QUOTATION' ],
    [ 'RI', 'REGIONAL_INDICATOR' ],
    [ 'SA', 'COMPLEX_CONTEXT' ],
    [ 'SG', 'SURROGATE' ],
    [ 'SP', 'SPACE' ],
    [ 'SY', 'SYMBOL' ],
    [ 'WJ', 'WORD_JOINER' ],
    [ 'XX', 'UNKNOWN' ],
    [ 'ZW', 'ZERO_WIDTH_SPACE' ],
) {
    $break_mappings{$entry->[0]} = "G_UNICODE_BREAK_" . $entry->[1];
}
||
153 | |||
# Title-case code point => its upper-case / lower-case counterpart.
# Both are filled in by process_one.
%title_to_upper = ();
%title_to_lower = ();

# State for the special-case (length-changing) case conversions.
my $special_case_offset = 0;
my @special_case_offsets;
my @special_cases;

# Script ranges, filled from the Scripts file as [ first, last, NAME ].
my @scripts;

# East Asian width ranges, filled as [ first, last, width ].
my @eawidths;

# Output selection.  The property tables are written by default;
# "-decomp" writes the decomposition tables instead, "-both" writes both.
($do_decomp, $do_props, $do_scripts) = (0, 1, 1);

if (@ARGV && ($ARGV[0] eq '-decomp' || $ARGV[0] eq '-both')) {
    $do_decomp = 1;
    # Only -decomp suppresses the property tables; the option is consumed
    # either way.
    $do_props = 0 if (shift (@ARGV) eq '-decomp');
}

# Exactly two positional arguments must remain: version and directory.
unless (@ARGV == 2) {
    $0 =~ s{.*/}{};
    die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n"
      . " DIRECTORY should contain the following Unicode data files:\n"
      . " UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n"
      . " CompositionExclusions.txt Scripts.txt extracted/DerivedEastAsianWidth.txt \n\n";
}
||
191 | |||
# Paths of the input files, discovered below.  The last name used to be
# declared as $derivedeastasianwidth while the rest of the script uses
# $derivedeastasianwidthtxt; the declaration now matches the use.
my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt,
    $compositionexclusionstxt, $scriptstxt, $derivedeastasianwidthtxt);

# File names may carry a version between the base name and ".txt".  The
# patterns are anchored at the end so that backup copies such as
# "UnicodeData.txt~" or "UnicodeData.txt.orig" cannot be picked up,
# whichever order readdir returns them in.
my $d = $ARGV[1];
opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n";
for my $f (readdir ($dir)) {
    $unicodedatatxt           = "$d/$f" if ($f =~ /^UnicodeData.*\.txt$/);
    $linebreaktxt             = "$d/$f" if ($f =~ /^LineBreak.*\.txt$/);
    $specialcasingtxt         = "$d/$f" if ($f =~ /^SpecialCasing.*\.txt$/);
    $casefoldingtxt           = "$d/$f" if ($f =~ /^CaseFolding.*\.txt$/);
    $compositionexclusionstxt = "$d/$f" if ($f =~ /^CompositionExclusions.*\.txt$/);
    $scriptstxt               = "$d/$f" if ($f =~ /^Scripts.*\.txt$/);
}
closedir ($dir);

my $extd = $ARGV[1] . "/extracted";
opendir (my $extdir, $extd) or die "Cannot open Unicode/extracted data dir $extd: $!\n";
for my $f (readdir ($extdir)) {
    $derivedeastasianwidthtxt = "$extd/$f" if ($f =~ /^DerivedEastAsianWidth.*\.txt$/);
}
closedir ($extdir);

# Every input is mandatory, whichever tables were requested.
defined $unicodedatatxt or die "Did not find UnicodeData file";
defined $linebreaktxt or die "Did not find LineBreak file";
defined $specialcasingtxt or die "Did not find SpecialCasing file";
defined $casefoldingtxt or die "Did not find CaseFolding file";
defined $compositionexclusionstxt or die "Did not find CompositionExclusions file";
defined $scriptstxt or die "Did not find Scripts file";
defined $derivedeastasianwidthtxt or die "Did not find DerivedEastAsianWidth file";

print "Creating decomp table\n" if ($do_decomp);
print "Creating property table\n" if ($do_props);
||
224 | |||
print "Composition exlusions from $compositionexclusionstxt\n";

# Three-argument open keeps odd characters in the path from being taken
# as an open mode, and a failure is reported instead of exiting silently.
open (INPUT, '<', $compositionexclusionstxt)
    or die "Cannot open $compositionexclusionstxt: $!\n";

# Each remaining line holds one hexadecimal code point that must not be
# recomposed; record it in %composition_exclusions.
while (<INPUT>) {
    # chomp rather than chop: a final line with no trailing newline must
    # not lose its last hex digit.
    chomp;

    # Drop comments first, so that an indented comment line ends up empty
    # instead of being recorded as code point 0.
    s/\s*#.*//;
    s/^\s*//;
    s/\s*$//;
    next if $_ eq '';

    $composition_exclusions{hex($_)} = 1;
}

close INPUT;
||
245 | |||
print "Unicode data from $unicodedatatxt\n";

open (INPUT, '<', $unicodedatatxt)
    or die "Cannot open $unicodedatatxt: $!\n";

# we save memory by skipping the huge empty area before U+E0000
my $pages_before_e0000;

# Walk the database in code point order, calling process_one for every
# code point from 0 to 0x10FFFF, including those the file leaves out.
$last_code = -1;
while (<INPUT>) {
    # chomp rather than chop, so an unterminated last line keeps its
    # final character.
    chomp;
    next if /^\s*$/;

    @fields = split (';', $_, 30);
    if ($#fields != 14) {
        printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
    }

    $code = hex ($fields[$CODE]);

    # First entry at or above U+E0000: remember how many 256-entry pages
    # cover everything seen before it.
    if ($code >= 0xE0000 and $last_code < 0xE0000) {
        $pages_before_e0000 = ($last_code >> 8) + 1;
    }

    if ($code > $last_code + 1) {
        # Found a gap.
        if ($fields[$NAME] =~ /Last>/) {
            # Fill the gap with the last character read,
            # since this was a range specified in the char database
            @gfields = @fields;
        }
        else {
            # The gap represents undefined characters.  Only the type
            # matters.
            @gfields = ('', '', 'Cn', '0', ('') x 11);
        }
        for (++$last_code; $last_code < $code; ++$last_code) {
            # Array element, not a hash element: the original wrote
            # $gfields{$CODE}, which filled an unrelated hash.
            $gfields[$CODE] = sprintf ("%04x", $last_code);
            &process_one ($last_code, @gfields);
        }
    }
    &process_one ($code, @fields);
    $last_code = $code;
}

close INPUT;

# A database with no entry at or above U+E0000 never reaches the check in
# the loop; derive the page count from the last code point seen so that
# later arithmetic on it is not done on undef.
if (!defined $pages_before_e0000) {
    $pages_before_e0000 = ($last_code < 0) ? 0 : ($last_code >> 8) + 1;
}

# Everything after the last entry is unassigned.
@gfields = ('', '', 'Cn', '0', ('') x 11);
for (++$last_code; $last_code <= 0x10FFFF; ++$last_code) {
    $gfields[$CODE] = sprintf ("%04x", $last_code);
    &process_one ($last_code, @gfields);
}
--$last_code; # Want last to be 0x10FFFF.
||
306 | |||
print "Creating line break table\n";

print "Line break data from $linebreaktxt\n";

open (INPUT, '<', $linebreaktxt)
    or die "Cannot open $linebreaktxt: $!\n";

{
    # Break class for a code point that has no entry in the file:
    # unassigned code points ('Cn') are XX, everything else is AL.
    my $default_break_for = sub {
        my ($cp) = @_;
        return ($type[$cp] eq 'Cn') ? 'XX' : 'AL';
    };

    $last_code = -1;
    while (<INPUT>) {
        # chomp rather than chop, so an unterminated last line keeps its
        # final character.
        chomp;

        # Strip comments before testing for emptiness, so indented comment
        # lines and whitespace-only lines are skipped quietly instead of
        # being reported as malformed entries.
        s/\s*#.*//;
        next if /^\s*$/;

        @fields = split (';', $_, 30);
        if ($#fields != 1) {
            printf STDERR ("Entry for $fields[$BREAK_CODE] has wrong number of fields (%d)\n", $#fields);
            next;
        }

        # An entry covers either a single code point or a START..END range.
        my ($start_code, $end_code);
        if ($fields[$BREAK_CODE] =~ /([A-F0-9]{4,6})\.\.([A-F0-9]{4,6})/) {
            $start_code = hex ($1);
            $end_code = hex ($2);
        }
        else {
            $start_code = $end_code = hex ($fields[$BREAK_CODE]);
        }

        if ($start_code > $last_code + 1) {
            # The gap represents undefined characters.  If assigned,
            # they are AL, if not assigned, XX
            for (++$last_code; $last_code < $start_code; ++$last_code) {
                $break_props[$last_code] = $default_break_for->($last_code);
            }
        }

        for my $cp ($start_code .. $end_code) {
            $break_props[$cp] = $fields[$BREAK_PROPERTY];
        }

        $last_code = $end_code;
    }

    close INPUT;

    # Code points after the last entry get the default class too.
    for (++$last_code; $last_code <= 0x10FFFF; ++$last_code) {
        $break_props[$last_code] = $default_break_for->($last_code);
    }
    --$last_code; # Want last to be 0x10FFFF.
}

print STDERR "Last code is not 0x10FFFF" if ($last_code != 0x10FFFF);
||
382 | |||
print "Reading special-casing table for case conversion\n";

open (INPUT, '<', $specialcasingtxt)
    or die "Cannot open $specialcasingtxt: $!\n";

# Register every unconditional special casing through add_special_case.
# Which two of the three mappings are passed depends on the general
# category of the code point.
while (<INPUT>) {
    my $code;

    # chomp rather than chop, so an unterminated last line keeps its
    # final character.
    chomp;

    # Strip comments before the emptiness test so that indented comment
    # lines are skipped instead of being parsed as an empty entry.
    s/\s*#.*//;
    next if /^\s*$/;

    @fields = split ('\s*;\s*', $_, 30);

    $raw_code = $fields[$CASE_CODE];
    $code = hex ($raw_code);

    if ($#fields != 4 && $#fields != 5) {
        printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
        next;
    }

    if (!defined $type[$code]) {
        # print, not printf: the text carries data and is not a format.
        print STDERR "Special case for code point: $code, which has no defined type\n";
        next;
    }

    if (defined $fields[5]) {
        # Ignore conditional special cases - we'll handle them in code
        next;
    }

    if ($type[$code] eq 'Lu') {
        (hex ($fields[$CASE_UPPER]) == $code)
            || die "$raw_code is Lu and UCD_Upper($raw_code) != $raw_code";

        &add_special_case ($code, $value[$code], $fields[$CASE_LOWER], $fields[$CASE_TITLE]);
    }
    elsif ($type[$code] eq 'Lt') {
        (hex ($fields[$CASE_TITLE]) == $code)
            || die "$raw_code is Lt and UCD_Title($raw_code) != $raw_code";

        &add_special_case ($code, undef, $fields[$CASE_LOWER], $fields[$CASE_UPPER]);
    }
    elsif ($type[$code] eq 'Ll') {
        (hex ($fields[$CASE_LOWER]) == $code)
            || die "$raw_code is Ll and UCD_Lower($raw_code) != $raw_code";

        &add_special_case ($code, $value[$code], $fields[$CASE_UPPER], $fields[$CASE_TITLE]);
    }
    else {
        print STDERR "Special case for non-alphabetic code point: $raw_code\n";
        next;
    }
}

close INPUT;
||
443 | |||
open (INPUT, '<', $casefoldingtxt)
    or die "Cannot open $casefoldingtxt: $!\n";

# Longest folded string seen (in bytes, plus one) and the list of
# [ code point, escaped folded string ] pairs that need a table entry.
my $casefoldlen = 0;
my @casefold;

while (<INPUT>) {
    my $code;

    # chomp rather than chop, so an unterminated last line keeps its
    # final character.
    chomp;

    # Strip comments before the emptiness test so that indented comment
    # lines are skipped instead of being parsed as an empty entry.
    s/\s*#.*//;
    next if /^\s*$/;

    @fields = split ('\s*;\s*', $_, 30);

    $raw_code = $fields[$FOLDING_CODE];
    $code = hex ($raw_code);

    if ($#fields != 3) {
        printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
        next;
    }

    # we don't use Simple or Turkic rules here
    next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);

    @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];

    # Check simple case: a one-character folding that equals the lower-case
    # form already recorded for this code point needs no table entry.
    if (@values == 1 &&
        !(defined $value[$code] && $value[$code] >= 0x1000000) &&
        defined $type[$code]) {

        my $lower;
        if ($type[$code] eq 'Lt') {
            $lower = $title_to_lower{$code};
        }
        elsif ($type[$code] eq 'Lu') {
            $lower = $value[$code];
        }
        else {
            # Lower-case letters and everything else fold to themselves.
            $lower = $code;
        }

        next if ($lower == $values[0]);
    }

    my $string = pack ("U*", @values);

    # One extra byte on top of the encoded length.
    my $needed = 1 + &length_in_bytes ($string);
    $casefoldlen = $needed if ($needed > $casefoldlen);

    push @casefold, [ $code, &escape ($string) ];
}

close INPUT;
||
511 | |||
print "Reading scripts\n";

open (INPUT, '<', $scriptstxt)
    or die "Cannot open $scriptstxt: $!\n";

# Every data line is "CODE ; Script" or "FIRST..LAST ; Script".  Each one
# is stored as [ first, last, SCRIPT ], with the name upper-cased.
while (<INPUT>) {
    s/#.*//;
    next if /^\s*$/;
    if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) {
        die "Cannot parse line: '$_'\n";
    }

    # A single code point is a range whose two ends coincide.
    my $first = hex ($1);
    my $last  = defined $2 ? hex ($2) : $first;
    push @scripts, [ $first, $last, uc ($3) ];
}

close INPUT;
||
531 | |||
print "Reading derived east asian widths\n";

open (INPUT, '<', $derivedeastasianwidthtxt)
    or die "Cannot open $derivedeastasianwidthtxt: $!\n";

# Every data line is "CODE ; Width" or "FIRST..LAST ; Width".  Each one is
# stored as [ first, last, width ], with the width kept as written.
while (<INPUT>) {
    # chomp rather than chop, so an unterminated last line keeps its
    # final character.
    chomp;

    s/#.*//;
    next if /^\s*$/;
    if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) {
        die "Cannot parse line: '$_'\n";
    }

    # A single code point is a range whose two ends coincide.
    my $first = hex ($1);
    my $last  = defined $2 ? hex ($2) : $first;
    push @eawidths, [ $first, $last, $3 ];
}

close INPUT;
||
556 | |||
# Write the requested headers.  The line break table is always written;
# the others depend on the option flags.
&print_tables ($last_code) if ($do_props);

if ($do_decomp) {
    &print_decomp ($last_code);
    &output_composition_table;
}

&print_line_break ($last_code);

&print_scripts if ($do_scripts);

exit 0;
||
571 | |||
572 | |||
# Return the length of $string in bytes.
#
# Perl's "length" counts characters unless the bytes pragma is in effect.
# The pragma is enabled here, inside the sub, so the result no longer
# depends on a "use bytes" far away at the top of the file.
sub length_in_bytes
{
    my ($string) = @_;

    use bytes;

    return length ($string);
}
||
580 | |||
# Record the properties of one code point.
#
#   $code   - the code point (number)
#   @fields - its UnicodeData fields, indexed by $CATEGORY, $UPPER, ...
#
# Updates @type, @value, @cclass, %title_to_lower, %title_to_upper,
# @decompose_compat, @decompositions and %compositions.
sub process_one
{
    my ($code, @fields) = @_;

    my $category = $fields[$CATEGORY];
    $type[$code] = $category;

    # @value holds a different thing per category: the digit value for
    # decimal digits, the opposite-case code point for cased letters.
    # Title-case letters go into the two title maps instead.
    if ($category eq 'Nd') {
        $value[$code] = int ($fields[$DECIMAL_VALUE]);
    }
    elsif ($category eq 'Ll') {
        $value[$code] = hex ($fields[$UPPER]);
    }
    elsif ($category eq 'Lu') {
        $value[$code] = hex ($fields[$LOWER]);
    }
    elsif ($category eq 'Lt') {
        $title_to_lower{$code} = hex ($fields[$LOWER]);
        $title_to_upper{$code} = hex ($fields[$UPPER]);
    }

    $cclass[$code] = $fields[$COMBINING_CLASSES];

    # Handle decompositions.
    my $decomp = $fields[$DECOMPOSITION];
    return if $decomp eq '';

    # A leading <tag> marks a compatibility decomposition; the tag is
    # stripped before the mapping is stored.
    my $is_compat = ($decomp =~ s/\<.*\>\s*//) ? 1 : 0;
    $decompose_compat[$code] = $is_compat;

    # Canonical decompositions can be recomposed unless excluded.
    if (!$is_compat && !exists $composition_exclusions{$code}) {
        $compositions{$code} = $decomp;
    }

    $decompositions[$code] = $decomp;
}
||
623 | |||
# Write gunichartables.h: the type and attribute tables, the title-case
# table, and whatever output_special_case_table, output_casefold_table and
# output_width_tables add.
#
#   $last - highest code point to cover
#
# Dies if the output file cannot be opened or fully written.
sub print_tables
{
    my ($last) = @_;
    my ($outfile) = "gunichartables.h";

    # Dynamically scoped so that print_row (and presumably the output_*
    # helpers called below) add to this run's total.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    # The open was unchecked; a failure used to produce nothing at all,
    # without any diagnostic.
    open (OUT, '>', $outfile)
        or die "Cannot open $outfile for writing: $!\n";

    print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
    print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n";

    print OUT "#ifndef CHARTABLES_H\n";
    print OUT "#define CHARTABLES_H\n\n";

    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";

    my $last_part1 = ($pages_before_e0000 * 256) - 1;
    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
    printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;

    # Emit the two page-index arrays (<prefix>_table_part1 and _part2)
    # from the labels currently held in @row.  The range between the end
    # of part 1 and U+E0000 is left out.
    my $print_page_index = sub {
        my ($prefix) = @_;

        printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
        print OUT "static const gint16 ${prefix}_table_part1[$pages_before_e0000] = {\n";
        for ($count = 0; $count <= $last_part1; $count += 256) {
            print OUT ",\n" if $count > 0;
            print OUT " ", $row[$count / 256];
            $bytes_out += 2;
        }
        print OUT "\n};\n\n";

        printf OUT "/* U+E0000 through U+%04X */\n", $last;
        print OUT "static const gint16 ${prefix}_table_part2[768] = {\n";
        for ($count = 0xE0000; $count <= $last; $count += 256) {
            print OUT ",\n" if $count > 0xE0000;
            print OUT " ", $row[$count / 256];
            $bytes_out += 2;
        }
        print OUT "\n};\n\n";
    };

    #
    # Type table: print_row writes the non-uniform pages and returns the
    # label to store in the page index.
    #
    $table_index = 0;
    printf OUT "static const char type_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256) {
        $row[$count / 256] = &print_row ($count, 1, \&fetch_type);
    }
    printf OUT "\n};\n\n";

    $print_page_index->('type');

    #
    # Now print attribute table.
    #
    $table_index = 0;
    printf OUT "static const gunichar attr_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256) {
        $row[$count / 256] = &print_row ($count, 4, \&fetch_attr);
    }
    printf OUT "\n};\n\n";

    $print_page_index->('attr');

    #
    # print title case table
    #
    print OUT "static const gunichar title_table[][3] = {\n";
    my ($first) = 1;
    # The keys are code points, so they are sorted numerically; the default
    # string sort only gives the same order while all keys happen to have
    # an ordering-compatible number of digits.
    foreach my $item (sort { $a <=> $b } keys %title_to_lower) {
        print OUT ",\n"
            unless $first;
        $first = 0;
        printf OUT " { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item};
        $bytes_out += 12;
    }
    print OUT "\n};\n\n";

    #
    # And special case conversion table -- conversions that change length
    #
    &output_special_case_table (\*OUT);
    &output_casefold_table (\*OUT);

    #
    # And the widths tables
    #
    &output_width_tables (\*OUT);

    print OUT "#endif /* CHARTABLES_H */\n";

    # Buffered write errors only surface when the handle is closed.
    close (OUT) or die "Error writing $outfile: $!\n";

    printf STDERR "Generated %d bytes in tables\n", $bytes_out;
}
||
746 | |||
# Fetch function for the type table: the symbolic name for the general
# category recorded for code point $index.
sub fetch_type
{
    my $category = $type[$_[0]];
    return $mappings{$category};
}
||
753 | |||
# Fetch function for the attribute table: the value recorded for code
# point $index as a hexadecimal literal, or "0x0000" when there is none.
sub fetch_attr
{
    my $attr = $value[$_[0]];
    return defined $attr ? sprintf ("0x%04x", $attr) : "0x0000";
}
||
767 | |||
# Emit one 256-entry page of a data table to OUT and return the text to
# store for it in the page index.
#
#   $start   - first code point of the page
#   $typsize - size in bytes of one entry, used for the byte total
#   $fetcher - code ref returning the entry text for a code point
#
# A page whose entries are all equal is not written: the shared entry text
# plus " + G_UNICODE_MAX_TABLE_INDEX" is returned instead.  Otherwise the
# page is written, $bytes_out grows by 256 * $typsize, and the current
# $table_index is returned (with a page comment) and then incremented.
sub print_row
{
    my ($start, $typsize, $fetcher) = @_;

    # Collect every entry first; the fetcher is called in scalar context.
    my @cells = map { scalar $fetcher->($start + $_) } (0 .. 255);

    my $uniform = 1;
    foreach my $cell (@cells) {
        $uniform = 0 if ($cell ne $cells[0]);
    }
    return $cells[0] . " + G_UNICODE_MAX_TABLE_INDEX" if ($uniform);

    printf OUT ",\n" if ($table_index != 0);
    printf OUT " { /* page %d, index %d */\n ", $start / 256, $table_index;

    # $column tracks the output width so that lines wrap before 78.
    my $column = 4;
    foreach my $pos (0 .. $#cells) {
        print OUT ", " if ($pos > 0);

        my $text = $cells[$pos];
        my $width = length ($text) + 2;
        if ($width + $column > 78) {
            print OUT "\n ";
            $column = 4;
        }
        print OUT $text;
        $column += $width;
    }
    print OUT "\n }";

    $bytes_out += 256 * $typsize;

    my $label = sprintf ("%d /* page %d */", $table_index, $start / 256);
    ++$table_index;
    return $label;
}
||
812 | |||
# Return $string as a sequence of C-style \xNN escapes, two hex digits per
# unit that unpack's "H*" template yields.
sub escape
{
    my ($string) = @_;

    my $hex = unpack ("H*", $string);

    # "H*" always yields an even number of digits, so walking the dump in
    # pairs consumes all of it.
    my $escaped = '';
    while ($hex =~ /(.{2})/g) {
        $escaped .= "\\x" . $1;
    }

    return $escaped;
}
||
822 | |||
# Return the offset of $decomp within the expansion string, adding it to
# the string when it is not there yet.
#
#   $decomp                   - decomposition string, or undef
#   $decomp_offsets_ref       - hash ref: string => offset already assigned
#   $decomp_string_ref        - scalar ref: C source text of the string
#   $decomp_string_offset_ref - scalar ref: offset the next entry will get
#
# An undefined $decomp returns the text "G_UNICODE_NOT_PRESENT_OFFSET" and
# changes nothing.
sub handle_decomp ($$$$)
{
    my ($decomp, $decomp_offsets_ref, $decomp_string_ref, $decomp_string_offset_ref) = @_;

    return "G_UNICODE_NOT_PRESENT_OFFSET" unless defined $decomp;

    # Identical strings share a single entry.
    my $known = $decomp_offsets_ref->{$decomp};
    return $known if defined $known;

    my $offset = $$decomp_string_offset_ref;
    $decomp_offsets_ref->{$decomp} = $offset;

    # Each entry is NUL-terminated, hence the extra byte in the offset.
    $$decomp_string_ref .= "\n \"" . &escape ($decomp) . "\\0\" /* offset $offset */";
    $$decomp_string_offset_ref += &length_in_bytes ($decomp) + 1;

    return $offset;
}
||
847 | |||
# Generate the character decomposition header, gunidecomp.h: the combining
# class tables, the decomposition table with its expansion string, and the
# single-step canonical decomposition table.
#
#   $last - highest code point to cover
#
# Dies if the output file cannot be opened or if the expansion string
# outgrows the 16-bit offsets.
#
# NOTE(review): OUT is not closed here, as in the original.  Confirm
# whether output_composition_table, which runs next, depends on that
# before adding a close.
sub print_decomp
{
    my ($last) = @_;
    my ($outfile) = "gunidecomp.h";

    # Dynamically scoped so that print_row adds to this run's total.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    # A failed open used to exit silently; report the reason instead.
    open (OUT, '>', $outfile)
        or die "Cannot open $outfile for writing: $!\n";

    print OUT "/* This file is automatically generated. DO NOT EDIT! */\n\n";
    print OUT "#ifndef DECOMP_H\n";
    print OUT "#define DECOMP_H\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX (0x110000 / 256)\n\n";

    my $last_part1 = ($pages_before_e0000 * 256) - 1;
    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
    printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;

    # Offsets are emitted as guint16, so 65535 is reserved for "absent".
    $NOT_PRESENT_OFFSET = 65535;
    print OUT "#define G_UNICODE_NOT_PRESENT_OFFSET $NOT_PRESENT_OFFSET\n\n";

    #
    # Combining class data pages and their two page-index arrays.
    #
    my ($count, @row);
    $table_index = 0;
    printf OUT "static const guchar cclass_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256) {
        $row[$count / 256] = &print_row ($count, 1, \&fetch_cclass);
    }
    printf OUT "\n};\n\n";

    print OUT "static const gint16 combining_class_table_part1[$pages_before_e0000] = {\n";
    for ($count = 0; $count <= $last_part1; $count += 256) {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    print OUT "static const gint16 combining_class_table_part2[768] = {\n";
    for ($count = 0xE0000; $count <= $last; $count += 256) {
        print OUT ",\n" if $count > 0xE0000;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    #
    # Decomposition table: one record per code point that decomposes,
    # holding offsets into the expansion string built alongside it.
    #
    print OUT "typedef struct\n{\n";
    print OUT " gunichar ch;\n";
    print OUT " guint16 canon_offset;\n";
    print OUT " guint16 compat_offset;\n";
    print OUT "} decomposition;\n\n";

    print OUT "static const decomposition decomp_table[] =\n{\n";
    my ($first) = 1;
    my ($decomp_string) = "";
    my ($decomp_string_offset) = 0;
    for ($count = 0; $count <= $last; ++$count) {
        next unless defined $decompositions[$count];

        print OUT ",\n"
            if ! $first;
        $first = 0;

        # Only canonically decomposable characters get a canonical entry;
        # the compatibility entry is dropped when it would be identical.
        my $canon_decomp;
        my $compat_decomp;

        if (!$decompose_compat[$count]) {
            $canon_decomp = make_decomp ($count, 0);
        }
        $compat_decomp = make_decomp ($count, 1);

        if (defined $canon_decomp && $compat_decomp eq $canon_decomp) {
            undef $compat_decomp;
        }

        my $canon_offset = handle_decomp ($canon_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);
        my $compat_offset = handle_decomp ($compat_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);

        die "Decomposition string offset $decomp_string_offset does not fit below $NOT_PRESENT_OFFSET\n"
            if $decomp_string_offset > $NOT_PRESENT_OFFSET;

        printf OUT qq( { 0x%04x, $canon_offset, $compat_offset }), $count;
        $bytes_out += 8;
    }
    print OUT "\n};\n\n";
    $bytes_out += $decomp_string_offset + 1;

    printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string;

    #
    # Single-step canonical decompositions.
    #
    print OUT "typedef struct\n{\n";
    print OUT " gunichar ch;\n";
    print OUT " gunichar a;\n";
    print OUT " gunichar b;\n";
    print OUT "} decomposition_step;\n\n";

    # There's lots of room to optimize the following table...
    print OUT "static const decomposition_step decomp_step_table[] =\n{\n";
    $first = 1;
    for ($count = 0; $count <= $last; ++$count) {
        next unless ((defined $decompositions[$count]) && (!$decompose_compat[$count]));

        print OUT ",\n"
            if ! $first;
        $first = 0;

        # The appended "0" supplies the second element when the mapping
        # has only one.
        my @list = (split (' ', $decompositions[$count]), "0");
        printf OUT qq( { 0x%05x, 0x%05x, 0x%05x }), $count, hex ($list[0]), hex ($list[1]);
    }
    print OUT "\n};\n\n";

    print OUT "#endif /* DECOMP_H */\n";

    printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out;
}
||
978 | |||
# Write gunibreak.h, the generated header holding the line-break property
# tables.
#
#   $last - highest code point covered by the tables.
#
# The per-page data rows are emitted through print_row (using
# fetch_break_type as the fetcher); the value print_row returns for each
# page is kept in the global @row and then written into the two page-index
# tables: part1 for the pages below $pages_before_e0000 * 256, part2 for
# U+E0000 up to $last.
#
# Dies if the output file cannot be opened or cannot be closed cleanly.
sub print_line_break
{
    my ($last) = @_;
    my ($outfile) = "gunibreak.h";

    # Dynamically scoped byte counter.
    # NOTE(review): presumably print_row adds to it as well - confirm.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    # OUT stays a bareword handle because the row printer defined elsewhere
    # in this script is expected to write to it - TODO confirm.
    open (OUT, ">", $outfile)
        or die "Cannot open $outfile: $!\n";

    print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
    print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n";

    print OUT "#ifndef BREAKTABLES_H\n";
    print OUT "#define BREAKTABLES_H\n\n";

    print OUT "#include <glib/gtypes.h>\n";
    print OUT "#include <glib/gunicode.h>\n\n";

    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last;

    print OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";

    my $last_part1 = ($pages_before_e0000 * 256) - 1;
    print OUT "/* the last code point that should be looked up in break_property_table_part1 */\n";
    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;

    # One data row per 256-code-point page; remember what print_row
    # returned for each page so the index tables below can refer to it.
    $table_index = 0;
    print OUT "static const gint8 break_property_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, 1, \&fetch_break_type);
    }
    print OUT "\n};\n\n";

    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
    print OUT "static const gint16 break_property_table_part1[$pages_before_e0000] = {\n";
    for ($count = 0; $count <= $last_part1; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    printf OUT "/* U+E0000 through U+%04X */\n", $last;
    print OUT "static const gint16 break_property_table_part2[768] = {\n";
    for ($count = 0xE0000; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0xE0000;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";


    print OUT "#endif /* BREAKTABLES_H */\n";

    # Buffered write errors only surface at close time, so check it.
    close (OUT)
        or die "Cannot close $outfile: $!\n";

    printf STDERR "Generated %d bytes in break tables\n", $bytes_out;
}
||
1044 | |||
1045 | |||
# Fetcher for the break properties table: look up the break property
# recorded for a code point in @break_props and translate it through
# %break_mappings.
sub fetch_break_type
{
    my $codepoint = shift;
    my $property = $break_props[$codepoint];
    return $break_mappings{$property};
}
||
1052 | |||
# Fetcher for the combining class table: return the entry of @cclass
# stored for the given code point.
sub fetch_cclass
{
    my $codepoint = shift;
    return $cclass[$codepoint];
}
||
1059 | |||
# Recursively expand the decomposition of a character.
#
#   $code   - code point whose entry in @decompositions is expanded
#   $compat - true to follow compatibility decompositions as well
#
# Returns the list of code points (as numbers) of the full expansion.
sub expand_decomp
{
    my ($code, $compat) = @_;

    my @expanded;
    for my $hex_value (split (' ', $decompositions[$code]))
    {
        my $member = hex ($hex_value);

        # A member is expanded further only if it has a decomposition of
        # its own and that decomposition is allowed in the current mode.
        my $expandable = defined $decompositions[$member]
            && ($compat || !$decompose_compat[$member]);

        push @expanded,
            $expandable ? expand_decomp ($member, $compat) : $member;
    }

    return @expanded;
}
||
1083 | |||
# Build the fully expanded decomposition of $code as a string, one packed
# ("U" template) character per code point returned by expand_decomp.
# $compat is passed straight through to expand_decomp.
sub make_decomp
{
    my ($code, $compat) = @_;

    my @codepoints = expand_decomp ($code, $compat);

    my $encoded = "";
    $encoded .= pack ("U", $_) for @codepoints; # to utf-8

    return $encoded;
}
||
# Generate special case data string from two fields.
#
#   $code    - code point the special case belongs to
#   $single  - optional single code point placed first (skipped if undef)
#   $field1  - whitespace-separated hex code points
#   $field2  - whitespace-separated hex code points
#
# The record is $single (if any), the code points of $field1, a 0
# separator and the code points of $field2, each packed with the "U"
# template.  The record's offset is appended to @special_case_offsets and
# stored, biased by 0x1000000, in $value[$code]; the escaped record is
# appended to @special_cases.
sub add_special_case
{
    my ($code, $single, $field1, $field2) = @_;

    # @values and $result are package variables here, as in the rest of
    # the script.
    @values = ();
    push @values, $single if defined $single;
    push @values, map { hex ($_) } split /\s+/, $field1;
    push @values, 0;
    push @values, map { hex ($_) } split /\s+/, $field2;

    $result = "";
    $result .= pack ("U", $_) for @values; # to utf-8

    push @special_case_offsets, $special_case_offset;

    # We encode special cases up in the 0x1000000 space
    $value[$code] = 0x1000000 + $special_case_offset;

    # One extra byte per record on top of the encoded length.
    $special_case_offset += 1 + &length_in_bytes ($result);

    push @special_cases, &escape ($result);
}
||
1121 | |||
# Write the special_case_table C array to the filehandle $out: one string
# literal per entry of @special_cases, each annotated with the matching
# entry of @special_case_offsets, then report the table size on STDERR.
sub output_special_case_table
{
    my $out = shift;

    print $out <<EOT;

/* Table of special cases for case conversion; each record contains
* First, the best single character mapping to lowercase if Lu,
* and to uppercase if Ll, followed by the output mapping for the two cases
* other than the case of the codepoint, in the order [Ll],[Lu],[Lt],
* encoded in UTF-8, separated and terminated by a null character.
*/
static const gchar special_case_table[] = {
EOT

    for my $index (0 .. $#special_cases) {
        printf $out qq( "%s\\0" /* offset %s */\n),
            $special_cases[$index], $special_case_offsets[$index];
    }

    print $out "};\n\n";

    my $table_bytes = $special_case_offset + 1;
    print STDERR "Generated $table_bytes bytes in special case table\n";
}
||
1150 | |||
# Renumber the entries of the hash referenced by $array in place.
#
# Keys whose value is exactly 1 are deleted; every remaining key is given
# a consecutive index starting at 0, assigned in ascending numeric key
# order.  Returns how many keys were kept.
sub enumerate_ordered
{
    my ($array) = @_;

    # Drop the singletons first ...
    my @singletons = grep { $array->{$_} == 1 } keys %$array;
    delete @{$array}{@singletons};

    # ... then number whatever is left.
    my $n = 0;
    $array->{$_} = $n++ for sort { $a <=> $b } keys %$array;

    return $n;
}
||
1166 | |||
# Write gunicomp.h, the tables used for canonical composition.
#
# Works from the global %compositions (code point => whitespace-separated
# hex decomposition) and @cclass.  Entries that cannot take part in
# composition are deleted from %compositions.  First/second characters
# that occur in more than one pair get an index into the two-dimensional
# compose_array; the ones that occur only once ("singletons") go into the
# compose_first_single / compose_second_single lists instead.
#
# Dies if a decomposition has more than two elements, if a composed code
# point no longer fits into guint16, or if the file cannot be opened or
# closed.
sub output_composition_table
{
    print STDERR "Generating composition table\n";

    # Dynamically scoped byte counter.
    # NOTE(review): presumably print_row adds to it as well - confirm.
    local ($bytes_out) = 0;

    my %first;
    my %second;

    # First we need to go through and remove decompositions
    # starting with a non-starter, and single-character
    # decompositions. At the same time, record
    # the first character of each decomposition

    for my $code (keys %compositions)
    {
        my @values = map { hex ($_) } split /\s+/, $compositions{$code};

        # non-starters and single-character decompositions
        if ($cclass[$code] || $cclass[$values[0]] || @values == 1) {
            delete $compositions{$code};
            next;
        }

        if (@values != 2) {
            die "$code has more than two elements in its decomposition!\n";
        }

        $first{$values[0]}++;
    }

    # Assign integer indices, removing singletons
    my $n_first = enumerate_ordered (\%first);

    # Now record the second character of each (non-singleton) decomposition
    for my $code (keys %compositions) {
        my @values = map { hex ($_) } split /\s+/, $compositions{$code};

        $second{$values[1]}++ if exists $first{$values[0]};
    }

    # Assign integer indices, removing singletons
    my $n_second = enumerate_ordered (\%second);

    # Build reverse table

    my @first_singletons;
    my @second_singletons;
    my %reverse;
    for my $code (keys %compositions) {
        my @values = map { hex ($_) } split /\s+/, $compositions{$code};

        my $first = $first{$values[0]};
        my $second = $second{$values[1]};

        if (defined $first && defined $second) {
            $reverse{"$first|$second"} = $code;
        } elsif (!defined $first) {
            push @first_singletons, [ $values[0], $values[1], $code ];
        } else {
            push @second_singletons, [ $values[1], $values[0], $code ];
        }
    }

    @first_singletons = sort { $a->[0] <=> $b->[0] } @first_singletons;
    @second_singletons = sort { $a->[0] <=> $b->[0] } @second_singletons;

    my %vals;

    open OUT, ">gunicomp.h" or die "Cannot open gunicomp.h: $!\n";

    # Assign values in lookup table for all code points involved.  The
    # value space is split into four consecutive ranges whose starting
    # points are published as COMPOSE_*_START macros.

    my $total = 1;
    my $last = 0;
    printf OUT "#define COMPOSE_FIRST_START %d\n", $total;
    for my $code (keys %first) {
        $vals{$code} = $first{$code} + $total;
        $last = $code if $code > $last;
    }
    $total += $n_first;

    my $i = 0;
    printf OUT "#define COMPOSE_FIRST_SINGLE_START %d\n", $total;
    for my $record (@first_singletons) {
        my $code = $record->[0];
        $vals{$code} = $i++ + $total;
        $last = $code if $code > $last;
    }
    $total += @first_singletons;

    printf OUT "#define COMPOSE_SECOND_START %d\n", $total;
    for my $code (keys %second) {
        $vals{$code} = $second{$code} + $total;
        $last = $code if $code > $last;
    }
    $total += $n_second;

    $i = 0;
    printf OUT "#define COMPOSE_SECOND_SINGLE_START %d\n\n", $total;
    for my $record (@second_singletons) {
        my $code = $record->[0];
        $vals{$code} = $i++ + $total;
        $last = $code if $code > $last;
    }

    printf OUT "#define COMPOSE_TABLE_LAST %d\n\n", $last / 256;

    # Output lookup table

    my @row;
    $table_index = 0;
    print OUT "static const guint16 compose_data[][256] = {\n";
    for (my $count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, 2, sub { exists $vals{$_[0]} ? $vals{$_[0]} : 0; });
    }
    print OUT "\n};\n\n";

    print OUT "static const gint16 compose_table[COMPOSE_TABLE_LAST + 1] = {\n";
    for (my $count = 0; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    # Output first singletons

    print OUT "static const gunichar compose_first_single[][2] = {\n";
    $i = 0;
    for my $record (@first_singletons) {
        print OUT ",\n" if $i++ > 0;
        printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
    }
    print OUT "\n};\n";

    # Each record is a pair of gunichar (32-bit) values: 8 bytes.
    $bytes_out += @first_singletons * 8;

    # Output second singletons

    print OUT "static const gunichar compose_second_single[][2] = {\n";
    $i = 0;
    for my $record (@second_singletons) {
        print OUT ",\n" if $i++ > 0;
        printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
    }
    print OUT "\n};\n";

    $bytes_out += @second_singletons * 8;

    # Output array of composition pairs

    print OUT "static const guint16 compose_array[$n_first][$n_second] = {\n";

    for (my $row_index = 0; $row_index < $n_first; $row_index++) {
        print OUT ",\n" if $row_index;
        print OUT " { ";
        for (my $col_index = 0; $col_index < $n_second; $col_index++) {
            print OUT ", " if $col_index;
            my $key = "$row_index|$col_index";
            if (exists $reverse{$key}) {
                if ($reverse{$key} > 0xFFFF) {
                    die "time to switch compose_array to gunichar" ;
                }
                printf OUT "0x%04x", $reverse{$key};
            } else {
                print OUT " 0";
            }
        }
        print OUT " }";
    }
    print OUT "\n";

    print OUT "};\n";

    $bytes_out += $n_first * $n_second * 2;

    # Buffered write errors only surface at close time, so check it.
    close OUT or die "Cannot close gunicomp.h: $!\n";

    printf STDERR "Generated %d bytes in compose tables\n", $bytes_out;
}
||
1368 | |||
# Write the casefold_table C array to the filehandle $out.
#
# The global @casefold holds [ code point, string ] pairs; it is sorted in
# place by code point and one initializer is emitted per pair.  The global
# $casefoldlen gives the size of the data member of each record.
#
# Dies if a code point does not fit into the guint16 "ch" member.
sub output_casefold_table
{
    my $out = shift;

    print $out <<EOT;

/* Table of casefolding cases that can't be derived by lowercasing
*/
static const struct {
guint16 ch;
gchar data[$casefoldlen];
} casefold_table[] = {
EOT

    @casefold = sort { $a->[0] <=> $b->[0] } @casefold;

    for my $case (@casefold)
    {
        my ($code, $string) = @$case;

        if ($code > 0xFFFF) {
            die "time to switch casefold_table to gunichar" ;
        }

        # Pass the string as an argument, not as part of the format, so a
        # '%' in the data cannot be mistaken for a conversion.
        printf $out qq( { 0x%04x, "%s" },\n), $code, $string;
    }

    print $out "};\n\n";

    # Record size: 2 bytes for ch plus the data array, rounded up to an
    # even number of bytes.
    my $recordlen = (2+$casefoldlen+1) & ~1;
    printf "Generated %d bytes for casefold table\n", $recordlen * @casefold;
}
||
1406 | |||
# Write one g_unicode_width_table_<name> interval array to the filehandle
# $out.
#
#   $out  - filehandle to print to
#   $name - suffix of the generated table name
#   $wpe  - regular expression (as a string) selecting the width
#           properties that belong in this table
#
# Walks the global @eawidths, whose records are [ start, end, property ],
# keeps the records whose property matches $wpe and merges runs of
# directly adjacent matching records into a single interval.
sub output_one_width_table
{
    my ($out, $name, $wpe) = @_;
    my $rex = qr/$wpe/;

    print $out "static const struct Interval g_unicode_width_table_${name}[] = {\n";

    my $idx = 0;
    while ($idx <= $#eawidths) {
        my $start = $eawidths[$idx]->[0];
        my $end = $eawidths[$idx]->[1];
        my $wp = $eawidths[$idx]->[2];
        $idx++;

        next unless $wp =~ $rex;

        # Swallow every following record that continues this interval
        # without a gap and also matches.
        while ($idx <= $#eawidths) {
            last unless $eawidths[$idx]->[0] == $end + 1;
            last unless $eawidths[$idx]->[2] =~ $rex;
            $end = $eawidths[$idx]->[1];
            $idx++;
        }

        printf $out "{0x%04X, 0x%04X},\n", $start, $end;
    }

    printf $out "};\n\n";
}
||
1438 | |||
# Write the width tables to the filehandle $out: the struct Interval
# definition followed by the "wide" table (properties matching [FW]) and
# the "ambiguous" table (properties matching [A]).  The global @eawidths
# is sorted in place by start code point first.
sub output_width_tables
{
    my $out = shift;

    @eawidths = sort { $a->[0] <=> $b->[0] } @eawidths;

    print $out "\n",
               "struct Interval\n",
               "{\n",
               "gunichar start, end;\n",
               "};\n",
               "\n";

    output_one_width_table ($out, "wide", "[FW]");
    output_one_width_table ($out, "ambiguous", "[A]");
}
||
1457 | |||
# Write gscripttable.h, the script lookup tables.
#
# The global @scripts holds [ start, end, script name ] records and is
# sorted in place by start code point.  Code points below $easy_range get
# one entry each in g_script_easy_table; everything from $easy_range
# upwards is written to g_script_table as (start, length, script) runs,
# with directly adjacent ranges of the same script merged.
#
# Dies if the output file cannot be opened or cannot be closed cleanly.
sub print_scripts
{
    my $start;
    my $end;
    my $script;
    my $easy_range;
    my $i;

    print STDERR "Writing gscripttable.h\n";

    open OUT, ">gscripttable.h" or die "Cannot open gscripttable.h: $!\n";

    print OUT<<EOT;
/* This file is automatically generated. DO NOT EDIT!
Instead, edit gen-unicode-tables.pl and re-run. */

#ifndef SCRIPTTABLES_H
#define SCRIPTTABLES_H

EOT

    @scripts = sort { $a->[0] <=> $b->[0] } @scripts;

    $easy_range = 0x2000;

    print OUT<<EOT;
#define G_EASY_SCRIPTS_RANGE $easy_range

static const guchar g_script_easy_table[$easy_range] = {
EOT

    $i = 0;
    $end = -1;

    for (my $c = 0; $c < $easy_range; $c++) {

        # Three entries per output line.
        if ($c % 3 == 0) {
            print OUT "\n ";
        }

        # Moved past the current range: load the next one.
        if ($c > $end) {
            if ($i <= $#scripts) {
                $start = $scripts[$i]->[0];
                $end = $scripts[$i]->[1];
                $script = $scripts[$i]->[2];
                $i++;
            } else {
                # No ranges left: park $start/$end so that every remaining
                # slot is emitted as unknown instead of an empty script
                # name, and so that the range split below is skipped.
                $start = $easy_range;
                $end = $easy_range - 1;
            }
        }

        if ($c < $start) {
            print OUT " G_UNICODE_SCRIPT_UNKNOWN,";
        } else {
            printf OUT " G_UNICODE_SCRIPT_%s,", $script;
        }
    }

    # The last range loaded reaches beyond the easy table: step back to it
    # and make its remainder start at $easy_range so the loop below emits
    # the rest of it.
    if ($end >= $easy_range) {
        $i--;
        $scripts[$i]->[0] = $easy_range;
    }

    print OUT<<EOT;

};

static const struct {
gunichar start;
guint16 chars;
guint16 script;
} g_script_table[] = {
EOT

    for (; $i <= $#scripts; $i++) {
        $start = $scripts[$i]->[0];
        $end = $scripts[$i]->[1];
        $script = $scripts[$i]->[2];

        # Merge following ranges that continue this one without a gap and
        # carry the same script.
        while ($i <= $#scripts - 1 &&
               $scripts[$i + 1]->[0] == $end + 1 &&
               $scripts[$i + 1]->[2] eq $script) {
            $i++;
            $end = $scripts[$i]->[1];
        }
        printf OUT " { %#06x, %5d, G_UNICODE_SCRIPT_%s },\n", $start, $end - $start + 1, $script;
    }

    # Plain print: this text is data, not a printf format.
    print OUT<<EOT;
};

#endif /* SCRIPTTABLES_H */
EOT

    # Buffered write errors only surface at close time, so check it.
    close OUT or die "Cannot close gscripttable.h: $!\n";
}