# WebSVN – nexmon – Blame – Rev 1 – /utilities/glib/glib/gen-unicode-tables.pl

# Package globals naming the column indices of the Unicode data files.
# Every index variable assigned below is declared here.
use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY
            $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE
            $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE
            $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $BREAK_PROPERTY
            $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION
            $FOLDING_CODE $FOLDING_STATUS $FOLDING_MAPPING);

# Names of fields in Unicode data table.
$CODE = 0;
$NAME = 1;
$CATEGORY = 2;
$COMBINING_CLASSES = 3;
$BIDI_CATEGORY = 4;
$DECOMPOSITION = 5;
$DECIMAL_VALUE = 6;
$DIGIT_VALUE = 7;
$NUMERIC_VALUE = 8;
$MIRRORED = 9;
$OLD_NAME = 10;
$COMMENT = 11;
$UPPER = 12;
$LOWER = 13;
$TITLE = 14;

# Names of fields in the line break table
$BREAK_CODE = 0;
$BREAK_PROPERTY = 1;

# Names of fields in the SpecialCasing table
$CASE_CODE = 0;
$CASE_LOWER = 1;
$CASE_TITLE = 2;
$CASE_UPPER = 3;
$CASE_CONDITION = 4;

# Names of fields in the CaseFolding table
$FOLDING_CODE = 0;
$FOLDING_STATUS = 1;
$FOLDING_MAPPING = 2;


# Map general category code onto symbolic name.
# Keys are the two-letter category codes stored in @type; values are the
# identifiers written verbatim into the generated type table.
%mappings =
    (
     # Normative.
     'Lu' => "G_UNICODE_UPPERCASE_LETTER",
     'Ll' => "G_UNICODE_LOWERCASE_LETTER",
     'Lt' => "G_UNICODE_TITLECASE_LETTER",
     'Mn' => "G_UNICODE_NON_SPACING_MARK",
     'Mc' => "G_UNICODE_SPACING_MARK",
     'Me' => "G_UNICODE_ENCLOSING_MARK",
     'Nd' => "G_UNICODE_DECIMAL_NUMBER",
     'Nl' => "G_UNICODE_LETTER_NUMBER",
     'No' => "G_UNICODE_OTHER_NUMBER",
     'Zs' => "G_UNICODE_SPACE_SEPARATOR",
     'Zl' => "G_UNICODE_LINE_SEPARATOR",
     'Zp' => "G_UNICODE_PARAGRAPH_SEPARATOR",
     'Cc' => "G_UNICODE_CONTROL",
     'Cf' => "G_UNICODE_FORMAT",
     'Cs' => "G_UNICODE_SURROGATE",
     'Co' => "G_UNICODE_PRIVATE_USE",
     'Cn' => "G_UNICODE_UNASSIGNED",

     # Informative.
     'Lm' => "G_UNICODE_MODIFIER_LETTER",
     'Lo' => "G_UNICODE_OTHER_LETTER",
     'Pc' => "G_UNICODE_CONNECT_PUNCTUATION",
     'Pd' => "G_UNICODE_DASH_PUNCTUATION",
     'Ps' => "G_UNICODE_OPEN_PUNCTUATION",
     'Pe' => "G_UNICODE_CLOSE_PUNCTUATION",
     'Pi' => "G_UNICODE_INITIAL_PUNCTUATION",
     'Pf' => "G_UNICODE_FINAL_PUNCTUATION",
     'Po' => "G_UNICODE_OTHER_PUNCTUATION",
     'Sm' => "G_UNICODE_MATH_SYMBOL",
     'Sc' => "G_UNICODE_CURRENCY_SYMBOL",
     'Sk' => "G_UNICODE_MODIFIER_SYMBOL",
     'So' => "G_UNICODE_OTHER_SYMBOL"
    );


# Map line break property code onto symbolic name.
# Keys are the two-character codes stored in @break_props; values are the
# identifiers written verbatim into the generated break property table.
%break_mappings =
    (
     'AI' => "G_UNICODE_BREAK_AMBIGUOUS",
     'AL' => "G_UNICODE_BREAK_ALPHABETIC",
     'B2' => "G_UNICODE_BREAK_BEFORE_AND_AFTER",
     'BA' => "G_UNICODE_BREAK_AFTER",
     'BB' => "G_UNICODE_BREAK_BEFORE",
     'BK' => "G_UNICODE_BREAK_MANDATORY",
     'CB' => "G_UNICODE_BREAK_CONTINGENT",
     'CJ' => "G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER",
     'CL' => "G_UNICODE_BREAK_CLOSE_PUNCTUATION",
     'CM' => "G_UNICODE_BREAK_COMBINING_MARK",
     'CP' => "G_UNICODE_BREAK_CLOSE_PARANTHESIS",
     'CR' => "G_UNICODE_BREAK_CARRIAGE_RETURN",
     'EX' => "G_UNICODE_BREAK_EXCLAMATION",
     'GL' => "G_UNICODE_BREAK_NON_BREAKING_GLUE",
     'H2' => "G_UNICODE_BREAK_HANGUL_LV_SYLLABLE",
     'H3' => "G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE",
     'HL' => "G_UNICODE_BREAK_HEBREW_LETTER",
     'HY' => "G_UNICODE_BREAK_HYPHEN",
     'ID' => "G_UNICODE_BREAK_IDEOGRAPHIC",
     'IN' => "G_UNICODE_BREAK_INSEPARABLE",
     'IS' => "G_UNICODE_BREAK_INFIX_SEPARATOR",
     'JL' => "G_UNICODE_BREAK_HANGUL_L_JAMO",
     'JT' => "G_UNICODE_BREAK_HANGUL_T_JAMO",
     'JV' => "G_UNICODE_BREAK_HANGUL_V_JAMO",
     'LF' => "G_UNICODE_BREAK_LINE_FEED",
     'NL' => "G_UNICODE_BREAK_NEXT_LINE",
     'NS' => "G_UNICODE_BREAK_NON_STARTER",
     'NU' => "G_UNICODE_BREAK_NUMERIC",
     'OP' => "G_UNICODE_BREAK_OPEN_PUNCTUATION",
     'PO' => "G_UNICODE_BREAK_POSTFIX",
     'PR' => "G_UNICODE_BREAK_PREFIX",
     'QU' => "G_UNICODE_BREAK_QUOTATION",
     'RI' => "G_UNICODE_BREAK_REGIONAL_INDICATOR",
     'SA' => "G_UNICODE_BREAK_COMPLEX_CONTEXT",
     'SG' => "G_UNICODE_BREAK_SURROGATE",
     'SP' => "G_UNICODE_BREAK_SPACE",
     'SY' => "G_UNICODE_BREAK_SYMBOL",
     'WJ' => "G_UNICODE_BREAK_WORD_JOINER",
     'XX' => "G_UNICODE_BREAK_UNKNOWN",
     'ZW' => "G_UNICODE_BREAK_ZERO_WIDTH_SPACE"
    );


# Title case mappings.
# Both hashes are keyed by the code point of a titlecase (Lt) character
# and are filled in by process_one.
%title_to_lower = ();
%title_to_upper = ();

# Maximum length of special-case strings

# Escaped special-case strings, the offset of each one inside the
# generated table, and the running offset for the next entry.  All three
# are maintained by add_special_case.
my @special_cases;
my @special_case_offsets;
my $special_case_offset = 0;

# Scripts
# Each entry is [ first code point, last code point, script name ].
my @scripts;

# East asian widths
# Each entry is [ first code point, last code point, width class ].
my @eawidths;

# Output selection; the command-line switches handled below adjust these.
$do_decomp = 0;
$do_props = 1;
$do_scripts = 1;


# Optional leading switch:
#   -decomp  generate only the decomposition tables
#   -both    generate the decomposition tables as well as the property tables
if (@ARGV && $ARGV[0] eq '-decomp')
{
    $do_decomp = 1;
    $do_props = 0;
    shift @ARGV;
}
elsif (@ARGV && $ARGV[0] eq '-both')
{
    $do_decomp = 1;
    shift @ARGV;
}

# Exactly two positional arguments must remain: the Unicode version and
# the directory holding the data files.
if (@ARGV != 2) {
    # Strip the directory part so the usage text shows just the script name.
    $0 =~ s@.*/@@;
    die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n       DIRECTORY should contain the following Unicode data files:\n       UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n       CompositionExclusions.txt Scripts.txt extracted/DerivedEastAsianWidth.txt \n\n";
}


# Paths of the Unicode data files, discovered by scanning the data
# directory.  The patterns accept anything between the base name and
# ".txt", so versioned file names are matched as well.
my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt,
    $compositionexclusionstxt, $scriptstxt, $derivedeastasianwidthtxt);

my $d = $ARGV[1];
opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n";
for my $f (readdir ($dir))
{
    $unicodedatatxt = "$d/$f" if ($f =~ /^UnicodeData.*\.txt/);
    $linebreaktxt = "$d/$f" if ($f =~ /^LineBreak.*\.txt/);
    $specialcasingtxt = "$d/$f" if ($f =~ /^SpecialCasing.*\.txt/);
    $casefoldingtxt = "$d/$f" if ($f =~ /^CaseFolding.*\.txt/);
    $compositionexclusionstxt = "$d/$f" if ($f =~ /^CompositionExclusions.*\.txt/);
    $scriptstxt = "$d/$f" if ($f =~ /^Scripts.*\.txt/);
}
closedir ($dir);

# The derived East Asian width data lives in the "extracted" subdirectory.
my $extd = $ARGV[1] . "/extracted";
opendir (my $extdir, $extd) or die "Cannot open Unicode/extracted data dir $extd: $!\n";
for my $f (readdir ($extdir))
{
    $derivedeastasianwidthtxt = "$extd/$f" if ($f =~ /^DerivedEastAsianWidth.*\.txt/);
}
closedir ($extdir);

# Every input is mandatory; stop before any output is produced.
defined $unicodedatatxt or die "Did not find UnicodeData file";
defined $linebreaktxt or die "Did not find LineBreak file";
defined $specialcasingtxt or die "Did not find SpecialCasing file";
defined $casefoldingtxt or die "Did not find CaseFolding file";
defined $compositionexclusionstxt or die "Did not find CompositionExclusions file";
defined $scriptstxt or die "Did not find Scripts file";
defined $derivedeastasianwidthtxt or die "Did not find DerivedEastAsianWidth file";


print "Creating decomp table\n" if ($do_decomp);
print "Creating property table\n" if ($do_props);

print "Composition exclusions from $compositionexclusionstxt\n";

# Record every excluded code point as a key of %composition_exclusions;
# process_one consults it when collecting canonical compositions.
open (my $excl_fh, '<', $compositionexclusionstxt)
    or die "Cannot open $compositionexclusionstxt: $!\n";

while (<$excl_fh>) {
    # chomp only strips a newline, so a final line without one stays intact.
    chomp;

    next if /^#/;
    next if /^\s*$/;

    # Drop trailing comments and surrounding whitespace, leaving the hex code.
    s/\s*#.*//;
    s/^\s*//;
    s/\s*$//;

    $composition_exclusions{hex($_)} = 1;
}

close ($excl_fh);


print "Unicode data from $unicodedatatxt\n";

open (my $ucd_fh, '<', $unicodedatatxt)
    or die "Cannot open $unicodedatatxt: $!\n";

# we save memory by skipping the huge empty area before U+E0000
my $pages_before_e0000;

# Walk the file in code point order, calling process_one for every code
# point from 0 to 0x10FFFF, including those the file leaves out.
$last_code = -1;
while (<$ucd_fh>)
{
    chomp;
    @fields = split (';', $_, 30);
    if ($#fields != 14)
    {
        printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
    }

    $code = hex ($fields[$CODE]);

    # First entry at or above U+E0000: remember how many 256-entry pages
    # are in use below it.
    if ($code >= 0xE0000 and $last_code < 0xE0000)
    {
        $pages_before_e0000 = ($last_code >> 8) + 1;
    }

    if ($code > $last_code + 1)
    {
        # Found a gap.
        if ($fields[$NAME] =~ /Last>/)
        {
            # Fill the gap with the last character read,
            # since this was a range specified in the char database
            @gfields = @fields;
        }
        else
        {
            # The gap represents undefined characters. Only the type
            # matters.
            @gfields = ('', '', 'Cn', '0', ('') x 11);
        }
        for (++$last_code; $last_code < $code; ++$last_code)
        {
            # Store the code in the record itself (array element, not a hash).
            $gfields[$CODE] = sprintf ("%04x", $last_code);
            process_one ($last_code, @gfields);
        }
    }
    process_one ($code, @fields);
    $last_code = $code;
}

close ($ucd_fh);

# Everything after the last entry in the file is unassigned.
@gfields = ('', '', 'Cn', '0', ('') x 11);
for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
{
    $gfields[$CODE] = sprintf ("%04x", $last_code);
    process_one ($last_code, @gfields);
}
--$last_code; # Want last to be 0x10FFFF.


print "Creating line break table\n";

print "Line break data from $linebreaktxt\n";

open (my $lb_fh, '<', $linebreaktxt)
    or die "Cannot open $linebreaktxt: $!\n";

# Fill @break_props for every code point.  Entries may be single code
# points or "START..END" ranges; code points not listed get a default
# derived from their general category.
$last_code = -1;
while (<$lb_fh>)
{
    my ($start_code, $end_code);

    chomp;

    next if /^#/;
    next if /^$/;

    s/\s*#.*//;

    @fields = split (';', $_, 30);
    if ($#fields != 1)
    {
        printf STDERR ("Entry for $fields[$BREAK_CODE] has wrong number of fields (%d)\n", $#fields);
        next;
    }

    if ($fields[$BREAK_CODE] =~ /([A-F0-9]{4,6})\.\.([A-F0-9]{4,6})/)
    {
        $start_code = hex ($1);
        $end_code = hex ($2);
    } else {
        $start_code = $end_code = hex ($fields[$BREAK_CODE]);
    }

    if ($start_code > $last_code + 1)
    {
        # The gap represents undefined characters. If assigned,
        # they are AL, if not assigned, XX
        for (++$last_code; $last_code < $start_code; ++$last_code)
        {
            $break_props[$last_code] = ($type[$last_code] eq 'Cn') ? 'XX' : 'AL';
        }
    }

    for ($last_code = $start_code; $last_code <= $end_code; $last_code++)
    {
        $break_props[$last_code] = $fields[$BREAK_PROPERTY];
    }

    $last_code = $end_code;
}

close ($lb_fh);

# Apply the same defaults to everything after the last listed entry.
for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
{
    $break_props[$last_code] = ($type[$last_code] eq 'Cn') ? 'XX' : 'AL';
}
--$last_code; # Want last to be 0x10FFFF.

print STDERR "Last code is not 0x10FFFF" if ($last_code != 0x10FFFF);


print "Reading special-casing table for case conversion\n";

open (my $sc_fh, '<', $specialcasingtxt)
    or die "Cannot open $specialcasingtxt: $!\n";

# Register every unconditional special case mapping through
# add_special_case.  Which two mappings are stored depends on the
# character's own case (Lu, Lt or Ll).
while (<$sc_fh>)
{
    my $code;

    chomp;

    next if /^#/;
    next if /^\s*$/;

    s/\s*#.*//;

    @fields = split ('\s*;\s*', $_, 30);

    $raw_code = $fields[$CASE_CODE];
    $code = hex ($raw_code);

    if ($#fields != 4 && $#fields != 5)
    {
        printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
        next;
    }

    if (!defined $type[$code])
    {
        print STDERR "Special case for code point: $code, which has no defined type\n";
        next;
    }

    if (defined $fields[5]) {
        # Ignore conditional special cases - we'll handle them in code
        next;
    }

    if ($type[$code] eq 'Lu')
    {
        # The mapping to the character's own case must be the identity.
        (hex $fields[$CASE_UPPER] == $code) || die "$raw_code is Lu and UCD_Upper($raw_code) != $raw_code";

        add_special_case ($code, $value[$code], $fields[$CASE_LOWER], $fields[$CASE_TITLE]);

    } elsif ($type[$code] eq 'Lt')
    {
        (hex $fields[$CASE_TITLE] == $code) || die "$raw_code is Lt and UCD_Title($raw_code) != $raw_code";

        add_special_case ($code, undef, $fields[$CASE_LOWER], $fields[$CASE_UPPER]);
    } elsif ($type[$code] eq 'Ll')
    {
        (hex $fields[$CASE_LOWER] == $code) || die "$raw_code is Ll and UCD_Lower($raw_code) != $raw_code";

        add_special_case ($code, $value[$code], $fields[$CASE_UPPER], $fields[$CASE_TITLE]);
    } else {
        print STDERR "Special case for non-alphabetic code point: $raw_code\n";
        next;
    }
}

close ($sc_fh);


open (my $cf_fh, '<', $casefoldingtxt)
    or die "Cannot open $casefoldingtxt: $!\n";

# Longest case-fold string seen so far (counted with length_in_bytes,
# plus one), and the collected [ code point, escaped string ] records.
my $casefoldlen = 0;
my @casefold;

while (<$cf_fh>)
{
    my $code;

    chomp;

    next if /^#/;
    next if /^\s*$/;

    s/\s*#.*//;

    @fields = split ('\s*;\s*', $_, 30);

    $raw_code = $fields[$FOLDING_CODE];
    $code = hex ($raw_code);

    if ($#fields != 3)
    {
        printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
        next;
    }

    # we don't use Simple or Turkic rules here
    next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);

    @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];

    # Check simple case

    # A one-character folding that equals the character's lowercase
    # mapping is skipped.  Values at or above 0x1000000 are special-case
    # offsets, not code points, so those entries are always kept.
    if (@values == 1 &&
        !(defined $value[$code] && $value[$code] >= 0x1000000) &&
        defined $type[$code]) {

        my $lower;
        if ($type[$code] eq 'Ll')
        {
            $lower = $code;
        } elsif ($type[$code] eq 'Lt')
        {
            $lower = $title_to_lower{$code};
        } elsif ($type[$code] eq 'Lu')
        {
            $lower = $value[$code];
        } else {
            $lower = $code;
        }

        if ($lower == $values[0]) {
            next;
        }
    }

    my $string = pack ("U*", @values);

    if (1 + length_in_bytes ($string) > $casefoldlen) {
        $casefoldlen = 1 + length_in_bytes ($string);
    }

    push @casefold, [ $code, escape ($string) ];
}

close ($cf_fh);


print "Reading scripts\n";

open (my $scripts_fh, '<', $scriptstxt)
    or die "Cannot open $scriptstxt: $!\n";

# Each data line is "CODE ; Script" or "START..END ; Script".  The script
# name is stored upper-cased, and single code points become one-element
# ranges.
while (<$scripts_fh>) {
    s/#.*//;
    next if /^\s*$/;
    if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) {
        die "Cannot parse line: '$_'\n";
    }

    if (defined $2) {
        push @scripts, [ hex $1, hex $2, uc $3 ];
    } else {
        push @scripts, [ hex $1, hex $1, uc $3 ];
    }
}

close ($scripts_fh);


print "Reading derived east asian widths\n";

open (my $eaw_fh, '<', $derivedeastasianwidthtxt)
    or die "Cannot open $derivedeastasianwidthtxt: $!\n";

# Each data line is "CODE ; Width" or "START..END ; Width".  The width
# class is stored as written, and single code points become one-element
# ranges.
while (<$eaw_fh>)
{
    my ($start_code, $end_code);

    chomp;

    s/#.*//;
    next if /^\s*$/;
    if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) {
        die "Cannot parse line: '$_'\n";
    }

    if (defined $2) {
        push @eawidths, [ hex $1, hex $2, $3 ];
    } else {
        push @eawidths, [ hex $1, hex $1, $3 ];
    }
}

close ($eaw_fh);


# All input has been read; write the requested output files.  The line
# break table is always generated, the others depend on the mode flags.
if ($do_props) {
    print_tables ($last_code);
}
if ($do_decomp) {
    print_decomp ($last_code);
    output_composition_table ();
}

print_line_break ($last_code);

if ($do_scripts) {
    print_scripts ();
}

exit 0;


# perl "length" returns the length in characters
#
# Return the length of $string as computed by Perl's length.
# NOTE(review): the name implies a byte count, which this only yields if
# byte semantics (e.g. "use bytes") are enabled in the part of the file
# that is not visible here -- confirm.
sub length_in_bytes
{
    my ($string) = @_;

    return length $string;
}


# Process a single character.
#
# $code is the numeric code point and @fields its UnicodeData record.
# Fills these global tables:
#   @type              general category
#   @value             decimal value (Nd), uppercase mapping (Ll) or
#                      lowercase mapping (Lu)
#   %title_to_lower,
#   %title_to_upper    case mappings of titlecase (Lt) characters
#   @cclass            combining class
#   @decompositions    decomposition with any "<tag>" prefix removed
#   @decompose_compat  1 if the decomposition carried a "<tag>", else 0
#   %compositions      untagged decompositions that are not listed in
#                      %composition_exclusions
sub process_one
{
    my ($code, @fields) = @_;

    $type[$code] = $fields[$CATEGORY];
    if ($type[$code] eq 'Nd')
    {
        $value[$code] = int ($fields[$DECIMAL_VALUE]);
    }
    elsif ($type[$code] eq 'Ll')
    {
        $value[$code] = hex ($fields[$UPPER]);
    }
    elsif ($type[$code] eq 'Lu')
    {
        $value[$code] = hex ($fields[$LOWER]);
    }

    if ($type[$code] eq 'Lt')
    {
        $title_to_lower{$code} = hex ($fields[$LOWER]);
        $title_to_upper{$code} = hex ($fields[$UPPER]);
    }

    $cclass[$code] = $fields[$COMBINING_CLASSES];

    # Handle decompositions.
    if ($fields[$DECOMPOSITION] ne '')
    {
        # The substitution strips a leading "<tag>"; it succeeds only
        # for tagged decompositions.
        if ($fields[$DECOMPOSITION] =~ s/\<.*\>\s*//) {
            $decompose_compat[$code] = 1;
        } else {
            $decompose_compat[$code] = 0;

            if (!exists $composition_exclusions{$code}) {
                $compositions{$code} = $fields[$DECOMPOSITION];
            }
        }
        $decompositions[$code] = $fields[$DECOMPOSITION];
    }
}


# Write gunichartables.h: the character type table, the attribute table,
# the title case table, and (through helper subs) the special case, case
# folding and width tables.
#
# $last is the highest code point to cover.  Each 256-entry page is
# emitted by print_row; the index tables are split into a part covering
# the pages below U+E0000 and a part covering U+E0000 through $last.
# Dies if the output file cannot be opened or closed.
sub print_tables
{
    my ($last) = @_;
    my ($outfile) = "gunichartables.h";

    # print_row adds to this dynamically scoped counter.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    open (OUT, '>', $outfile) or die "Cannot open $outfile for writing: $!\n";

    print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
    print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n";

    print OUT "#ifndef CHARTABLES_H\n";
    print OUT "#define CHARTABLES_H\n\n";

    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";

    my $last_part1 = ($pages_before_e0000 * 256) - 1;
    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
    printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;

    $table_index = 0;
    printf OUT "static const char type_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = print_row ($count, 1, \&fetch_type);
    }
    printf OUT "\n};\n\n";

    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
    print OUT "static const gint16 type_table_part1[$pages_before_e0000] = {\n";
    for ($count = 0; $count <= $last_part1; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    printf OUT "/* U+E0000 through U+%04X */\n", $last;
    print OUT "static const gint16 type_table_part2[768] = {\n";
    for ($count = 0xE0000; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0xE0000;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    #
    # Now print attribute table.
    #

    $table_index = 0;
    printf OUT "static const gunichar attr_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = print_row ($count, 4, \&fetch_attr);
    }
    printf OUT "\n};\n\n";

    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
    print OUT "static const gint16 attr_table_part1[$pages_before_e0000] = {\n";
    for ($count = 0; $count <= $last_part1; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    printf OUT "/* U+E0000 through U+%04X */\n", $last;
    print OUT "static const gint16 attr_table_part2[768] = {\n";
    for ($count = 0xE0000; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0xE0000;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    #
    # print title case table
    #

    print OUT "static const gunichar title_table[][3] = {\n";
    my ($first) = 1;
    # The keys are code points, so sort them numerically.
    foreach my $item (sort { $a <=> $b } keys %title_to_lower)
    {
        print OUT ",\n"
            unless $first;
        $first = 0;
        printf OUT " { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item};
        $bytes_out += 12;
    }
    print OUT "\n};\n\n";

    #
    # And special case conversion table -- conversions that change length
    #
    output_special_case_table (\*OUT);
    output_casefold_table (\*OUT);

    #
    # And the widths tables
    #
    output_width_tables (\*OUT);

    print OUT "#endif /* CHARTABLES_H */\n";

    # Buffered write errors only surface at close.
    close (OUT) or die "Cannot close $outfile: $!\n";

    printf STDERR "Generated %d bytes in tables\n", $bytes_out;
}


# A fetch function for the type table.
#
# Returns the %mappings entry for the general category recorded in @type
# for code point $index.
sub fetch_type
{
    my ($index) = @_;
    return $mappings{$type[$index]};
}


# A fetch function for the attribute table.
#
# Returns the @value entry for code point $index formatted as a hex
# literal, or "0x0000" when no value is recorded for it.
sub fetch_attr
{
    my ($index) = @_;

    return "0x0000" unless defined $value[$index];
    return sprintf ("0x%04x", $value[$index]);
}


# Emit one 256-entry page of a table to OUT and return the text to place
# in the page index table.
#
#   $start    first code point of the page
#   $typsize  size in bytes of one entry, used only for $bytes_out
#   $fetcher  code ref called with a code point, returning the entry text
#
# If all 256 entries are identical nothing is written and
# "<entry> + G_UNICODE_MAX_TABLE_INDEX" is returned.  Otherwise the page
# is printed, $bytes_out grows by 256 * $typsize, and the return value
# is the page's position in the data table; the global $table_index is
# then incremented.
sub print_row
{
    my ($start, $typsize, $fetcher) = @_;

    my ($i);
    my (@values);
    my ($flag) = 1;
    my ($off);

    # Fetch the whole page, noting whether every entry equals the first.
    for ($off = 0; $off < 256; ++$off)
    {
        $values[$off] = $fetcher->($off + $start);
        if ($values[$off] ne $values[0])
        {
            $flag = 0;
        }
    }
    if ($flag)
    {
        return $values[0] . " + G_UNICODE_MAX_TABLE_INDEX";
    }

    printf OUT ",\n" if ($table_index != 0);
    printf OUT " { /* page %d, index %d */\n ", $start / 256, $table_index;
    my ($column) = 4;
    for ($i = $start; $i < $start + 256; ++$i)
    {
        print OUT ", "
            if $i > $start;
        my ($text) = $values[$i - $start];
        # Wrap before the line would grow past 78 columns.
        if (length ($text) + $column + 2 > 78)
        {
            print OUT "\n ";
            $column = 4;
        }
        print OUT $text;
        $column += length ($text) + 2;
    }
    print OUT "\n }";

    $bytes_out += 256 * $typsize;

    return sprintf "%d /* page %d */", $table_index++, $start / 256;
}


# Return $string with each element of its hexadecimal dump (two digits
# at a time, as produced by unpack "H*") written as a "\xNN" escape.
sub escape
{
    my ($string) = @_;

    my $escaped = unpack("H*", $string);
    $escaped =~ s/(.{2})/\\x$1/g;

    return $escaped;
}


# Returns the offset of $decomp in the offset string. Updates the
# referenced variables as appropriate.
#
#   $decomp                    decomposition string, or undef for none
#   $decomp_offsets_ref        hash ref mapping known strings to offsets
#   $decomp_string_ref         ref to the accumulated C string literal text
#   $decomp_string_offset_ref  ref to the offset of the next new string
#
# An undefined $decomp yields the literal "G_UNICODE_NOT_PRESENT_OFFSET".
# A string seen before reuses its recorded offset.  A new string is
# appended, escaped and NUL-terminated, and the running offset advances
# by its length plus one.
sub handle_decomp ($$$$)
{
    my ($decomp, $decomp_offsets_ref, $decomp_string_ref, $decomp_string_offset_ref) = @_;
    my $offset = "G_UNICODE_NOT_PRESENT_OFFSET";

    if (defined $decomp)
    {
        if (defined $decomp_offsets_ref->{$decomp})
        {
            $offset = $decomp_offsets_ref->{$decomp};
        }
        else
        {
            $offset = ${$decomp_string_offset_ref};
            $decomp_offsets_ref->{$decomp} = $offset;
            ${$decomp_string_ref} .= "\n \"" . escape ($decomp) . "\\0\" /* offset ${$decomp_string_offset_ref} */";
            ${$decomp_string_offset_ref} += length_in_bytes ($decomp) + 1;
        }
    }

    return $offset;
}


# Generate the character decomposition header.
#
# Writes gunidecomp.h for code points 0 through $last: the combining
# class pages and their two index tables, the decomposition table with
# offsets into a shared string of expansions, and the single-step
# canonical decomposition table.  Dies if the file cannot be opened or
# closed, or if the expansion string outgrows the 16-bit offset range.
sub print_decomp
{
    my ($last) = @_;
    my ($outfile) = "gunidecomp.h";

    # print_row adds to this dynamically scoped counter.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    open (OUT, '>', $outfile) or die "Cannot open $outfile for writing: $!\n";

    print OUT "/* This file is automatically generated. DO NOT EDIT! */\n\n";
    print OUT "#ifndef DECOMP_H\n";
    print OUT "#define DECOMP_H\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX (0x110000 / 256)\n\n";

    my $last_part1 = ($pages_before_e0000 * 256) - 1;
    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
    printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;

    # Sentinel offset written for "no decomposition of this kind".
    $NOT_PRESENT_OFFSET = 65535;
    print OUT "#define G_UNICODE_NOT_PRESENT_OFFSET $NOT_PRESENT_OFFSET\n\n";

    my ($count, @row);
    $table_index = 0;
    printf OUT "static const guchar cclass_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = print_row ($count, 1, \&fetch_cclass);
    }
    printf OUT "\n};\n\n";

    print OUT "static const gint16 combining_class_table_part1[$pages_before_e0000] = {\n";
    for ($count = 0; $count <= $last_part1; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    print OUT "static const gint16 combining_class_table_part2[768] = {\n";
    for ($count = 0xE0000; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0xE0000;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    print OUT "typedef struct\n{\n";
    print OUT " gunichar ch;\n";
    print OUT " guint16 canon_offset;\n";
    print OUT " guint16 compat_offset;\n";
    print OUT "} decomposition;\n\n";

    print OUT "static const decomposition decomp_table[] =\n{\n";
    my ($first) = 1;
    my ($decomp_string) = "";
    my ($decomp_string_offset) = 0;
    for ($count = 0; $count <= $last; ++$count)
    {
        if (defined $decompositions[$count])
        {
            print OUT ",\n"
                if ! $first;
            $first = 0;

            my $canon_decomp;
            my $compat_decomp;

            # Only untagged decompositions have a canonical expansion.
            if (!$decompose_compat[$count]) {
                $canon_decomp = make_decomp ($count, 0);
            }
            $compat_decomp = make_decomp ($count, 1);

            # Do not store the compatibility expansion twice when it is
            # identical to the canonical one.
            if (defined $canon_decomp && $compat_decomp eq $canon_decomp) {
                undef $compat_decomp;
            }

            my $canon_offset = handle_decomp ($canon_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);
            my $compat_offset = handle_decomp ($compat_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);

            die "Decomposition string offset $decomp_string_offset exceeds $NOT_PRESENT_OFFSET\n"
                if $decomp_string_offset > $NOT_PRESENT_OFFSET;

            printf OUT qq( { 0x%04x, $canon_offset, $compat_offset }), $count;
            $bytes_out += 8;
        }
    }
    print OUT "\n};\n\n";
    $bytes_out += $decomp_string_offset + 1;

    printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string;

    print OUT "typedef struct\n{\n";
    print OUT " gunichar ch;\n";
    print OUT " gunichar a;\n";
    print OUT " gunichar b;\n";
    print OUT "} decomposition_step;\n\n";

    # There's lots of room to optimize the following table...
    print OUT "static const decomposition_step decomp_step_table[] =\n{\n";
    $first = 1;
    for ($count = 0; $count <= $last; ++$count)
    {
        if ((defined $decompositions[$count]) && (!$decompose_compat[$count]))
        {
            print OUT ",\n"
                if ! $first;
            $first = 0;
            # The trailing "0" supplies the second element of a
            # one-character decomposition.
            my @list = (split(' ', $decompositions[$count]), "0");
            printf OUT qq( { 0x%05x, 0x%05x, 0x%05x }), $count, hex($list[0]), hex($list[1]);
        }
    }
    print OUT "\n};\n\n";

    print OUT "#endif /* DECOMP_H */\n";

    # Buffered write errors only surface at close.
    close (OUT) or die "Cannot close $outfile: $!\n";

    printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out;
}


# Write gunibreak.h: the line break property pages for code points 0
# through $last, plus two index tables, one for the pages below U+E0000
# and one for U+E0000 through $last.
# Dies if the output file cannot be opened or closed.
sub print_line_break
{
    my ($last) = @_;
    my ($outfile) = "gunibreak.h";

    # print_row adds to this dynamically scoped counter.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    open (OUT, '>', $outfile) or die "Cannot open $outfile for writing: $!\n";

    print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
    print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n";

    print OUT "#ifndef BREAKTABLES_H\n";
    print OUT "#define BREAKTABLES_H\n\n";

    print OUT "#include <glib/gtypes.h>\n";
    print OUT "#include <glib/gunicode.h>\n\n";

    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last;

    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";

    my $last_part1 = ($pages_before_e0000 * 256) - 1;
    printf OUT "/* the last code point that should be looked up in break_property_table_part1 */\n";
    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;

    $table_index = 0;
    printf OUT "static const gint8 break_property_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = print_row ($count, 1, \&fetch_break_type);
    }
    printf OUT "\n};\n\n";

    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
    print OUT "static const gint16 break_property_table_part1[$pages_before_e0000] = {\n";
    for ($count = 0; $count <= $last_part1; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    printf OUT "/* U+E0000 through U+%04X */\n", $last;
    print OUT "static const gint16 break_property_table_part2[768] = {\n";
    for ($count = 0xE0000; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0xE0000;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    print OUT "#endif /* BREAKTABLES_H */\n";

    # Buffered write errors only surface at close.
    close (OUT) or die "Cannot close $outfile: $!\n";

    printf STDERR "Generated %d bytes in break tables\n", $bytes_out;
}

# A fetch function for the break properties table.
#
# Returns the %break_mappings entry for the line break property recorded
# in @break_props for code point $index.
sub fetch_break_type
{
    my ($index) = @_;
    return $break_mappings{$break_props[$index]};
}


# Fetcher for combining class.
#
# Returns the @cclass entry recorded for code point $i.
sub fetch_cclass
{
    my ($i) = @_;
    return $cclass[$i];
}


# Expand a character decomposition recursively.
#
# Returns the list of code points that $code decomposes to.  An element
# that has a decomposition of its own is expanded in turn; when $compat
# is false, elements whose decomposition is flagged in @decompose_compat
# are left unexpanded.
sub expand_decomp
{
    my ($code, $compat) = @_;

    my ($iter, $val);
    my (@result) = ();
    foreach $iter (split (' ', $decompositions[$code]))
    {
        $val = hex ($iter);
        if (defined $decompositions[$val] &&
            ($compat || !$decompose_compat[$val]))
        {
            push (@result, expand_decomp ($val, $compat));
        }
        else
        {
            push (@result, $val);
        }
    }

    return @result;
}


# Return the fully expanded decomposition of $code as a string, built by
# packing each code point returned by expand_decomp with the "U"
# template.  $compat is passed through to expand_decomp.
sub make_decomp
{
    my ($code, $compat) = @_;

    my $result = "";
    foreach my $iter (expand_decomp ($code, $compat))
    {
        $result .= pack ("U", $iter); # to utf-8
    }

    return $result;
}


# Generate special case data string from two fields
#
#   $code    code point the special case belongs to
#   $single  single-character mapping stored first, or undef to omit it
#   $field1  space-separated hex code points of the first mapping
#   $field2  space-separated hex code points of the second mapping
#
# The record is: optional $single, the $field1 characters, a NUL
# separator, the $field2 characters.  Its escaped form is appended to
# @special_cases and its start offset to @special_case_offsets.
# $value[$code] is replaced by 0x1000000 plus that offset, and the
# running offset advances by the record length plus one.
sub add_special_case
{
    my ($code, $single, $field1, $field2) = @_;

    my @values = (defined $single ? $single : (),
                  (map { hex ($_) } split /\s+/, $field1),
                  0,
                  (map { hex ($_) } split /\s+/, $field2));

    my $result = "";

    for my $value (@values) {
        $result .= pack ("U", $value); # to utf-8
    }

    push @special_case_offsets, $special_case_offset;

    # We encode special cases up in the 0x1000000 space
    $value[$code] = 0x1000000 + $special_case_offset;

    $special_case_offset += 1 + length_in_bytes ($result);

    push @special_cases, escape ($result);
}


# Write the special case table to the filehandle $out: one C string
# literal per entry of @special_cases, each terminated with "\0" and
# annotated with its offset from @special_case_offsets.  The table size
# is reported on STDERR.
sub output_special_case_table
{
    my $out = shift;

    print $out <<EOT;
/* Table of special cases for case conversion; each record contains
* First, the best single character mapping to lowercase if Lu,
* and to uppercase if Ll, followed by the output mapping for the two cases
* other than the case of the codepoint, in the order [Ll],[Lu],[Lt],
* encoded in UTF-8, separated and terminated by a null character.
*/
static const gchar special_case_table[] = {
EOT

    my $i = 0;
    for my $case (@special_cases) {
        print $out qq( "$case\\0" /* offset $special_case_offsets[$i] */\n);
        $i++;
    }

    print $out <<EOT;
};

EOT

    print STDERR "Generated " . ($special_case_offset + 1) . " bytes in special case table\n";
}


# Renumber the entries of the hash referenced by $array in place.
#
# Keys are visited in ascending numeric order.  Entries whose value is 1
# are deleted; every remaining entry has its value replaced by the next
# index, counting from 0.  Returns the number of entries kept.
sub enumerate_ordered
{
    my ($array) = @_;

    my $n = 0;
    for my $code (sort { $a <=> $b } keys %$array) {
        if ($array->{$code} == 1) {
            delete $array->{$code};
            next;
        }
        $array->{$code} = $n++;
    }

    return $n;
}

# Build the composition lookup tables and write them to gunicomp.h.
#
# Inputs (package globals):
#   %compositions - code point => decomposition, given as whitespace-separated
#                   hex code points.  Entries that cannot take part in
#                   composition are deleted from this hash as a side effect.
#   @cclass       - indexed by code point; a true value marks the code point
#                   as a non-starter.
#
# Output: the file gunicomp.h in the current directory, containing
#   * COMPOSE_*_START / COMPOSE_TABLE_LAST #defines,
#   * compose_data / compose_table: a two-level (256 entries per page) map
#     from code point to a small index,
#   * compose_first_single / compose_second_single: pairs for characters that
#     occur in only one composition,
#   * compose_array: the [first index][second index] => composed code point
#     matrix for everything else.
#
# Dies if the file cannot be opened or closed, if a decomposition has more
# than two elements, or if a composed code point does not fit in 16 bits.
sub output_composition_table
{
    print STDERR "Generating composition table\n";

    # NOTE(review): dynamically scoped (local, not my) - presumably so that
    # print_row can add to the same counter; confirm against print_row.
    local ($bytes_out) = 0;

    my %first;
    my %second;

    # First we need to go through and remove decompositions starting with a
    # non-starter, and single-character decompositions.  At the same time,
    # count how often each character occurs as the first element.
    for my $code (keys %compositions) {
        my @values = map { hex ($_) } split /\s+/, $compositions{$code};

        # non-starters (either the composed character or its first element)
        if ($cclass[$code] || $cclass[$values[0]]) {
            delete $compositions{$code};
            next;
        }

        # single-character decompositions
        if (@values == 1) {
            delete $compositions{$code};
            next;
        }

        if (@values != 2) {
            die "$code has more than two elements in its decomposition!\n";
        }

        $first{$values[0]}++;
    }

    # Assign integer indices, removing singletons
    my $n_first = enumerate_ordered (\%first);

    # Now count the second character of each decomposition whose first
    # character is not a singleton.
    for my $code (keys %compositions) {
        my @values = map { hex ($_) } split /\s+/, $compositions{$code};

        $second{$values[1]}++ if exists $first{$values[0]};
    }

    # Assign integer indices, removing singletons
    my $n_second = enumerate_ordered (\%second);

    # Build reverse table: "firstindex|secondindex" => composed code point.
    # Compositions involving a singleton go into the two side lists instead.
    my @first_singletons;
    my @second_singletons;
    my %reverse;

    for my $code (keys %compositions) {
        my @values = map { hex ($_) } split /\s+/, $compositions{$code};

        my $first  = $first{$values[0]};
        my $second = $second{$values[1]};

        if (defined $first && defined $second) {
            $reverse{"$first|$second"} = $code;
        } elsif (!defined $first) {
            push @first_singletons, [ $values[0], $values[1], $code ];
        } else {
            push @second_singletons, [ $values[1], $values[0], $code ];
        }
    }

    @first_singletons  = sort { $a->[0] <=> $b->[0] } @first_singletons;
    @second_singletons = sort { $a->[0] <=> $b->[0] } @second_singletons;

    my %vals;

    open OUT, ">gunicomp.h" or die "Cannot open gunicomp.h: $!\n";

    # Assign values in lookup table for all code points involved.  The four
    # groups occupy consecutive index ranges; 0 is kept for "not involved".
    my $total = 1;
    my $last  = 0;

    printf OUT "#define COMPOSE_FIRST_START %d\n", $total;
    for my $code (keys %first) {
        $vals{$code} = $first{$code} + $total;
        $last = $code if $code > $last;
    }
    $total += $n_first;

    printf OUT "#define COMPOSE_FIRST_SINGLE_START %d\n", $total;
    my $slot = 0;
    for my $record (@first_singletons) {
        my $code = $record->[0];
        $vals{$code} = $slot++ + $total;
        $last = $code if $code > $last;
    }
    $total += @first_singletons;

    printf OUT "#define COMPOSE_SECOND_START %d\n", $total;
    for my $code (keys %second) {
        $vals{$code} = $second{$code} + $total;
        $last = $code if $code > $last;
    }
    $total += $n_second;

    printf OUT "#define COMPOSE_SECOND_SINGLE_START %d\n\n", $total;
    $slot = 0;
    for my $record (@second_singletons) {
        my $code = $record->[0];
        $vals{$code} = $slot++ + $total;
        $last = $code if $code > $last;
    }

    # %d truncates, so this is the index of the last 256-entry page.
    printf OUT "#define COMPOSE_TABLE_LAST %d\n\n", $last / 256;

    # Output lookup table

    my @row;

    # NOTE(review): package global, presumably the running page counter used
    # by print_row; confirm against print_row.
    $table_index = 0;

    print OUT "static const guint16 compose_data[][256] = {\n";
    for (my $count = 0; $count <= $last; $count += 256) {
        $row[$count / 256] = &print_row ($count, 2, sub { exists $vals{$_[0]} ? $vals{$_[0]} : 0; });
    }
    print OUT "\n};\n\n";

    print OUT "static const gint16 compose_table[COMPOSE_TABLE_LAST + 1] = {\n";
    for (my $count = 0; $count <= $last; $count += 256) {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    # Output first singletons

    print OUT "static const gunichar compose_first_single[][2] = {\n";
    print OUT join (",\n",
                    map { sprintf " { %#06x, %#06x }", $_->[1], $_->[2] }
                    @first_singletons);
    print OUT "\n};\n";

    # NOTE(review): counts 4 bytes per record although each record is two
    # gunichar values; only affects the statistic printed below.
    $bytes_out += @first_singletons * 4;

    # Output second singletons

    print OUT "static const gunichar compose_second_single[][2] = {\n";
    print OUT join (",\n",
                    map { sprintf " { %#06x, %#06x }", $_->[1], $_->[2] }
                    @second_singletons);
    print OUT "\n};\n";

    $bytes_out += @second_singletons * 4;

    # Output array of composition pairs

    print OUT <<EOT;
static const guint16 compose_array[$n_first][$n_second] = {
EOT

    for (my $i = 0; $i < $n_first; $i++) {
        print OUT ",\n" if $i;
        print OUT " { ";
        for (my $j = 0; $j < $n_second; $j++) {
            print OUT ", " if $j;
            if (exists $reverse{"$i|$j"}) {
                # compose_array is declared guint16
                if ($reverse{"$i|$j"} > 0xFFFF) {
                    die "time to switch compose_array to gunichar" ;
                }
                printf OUT "0x%04x", $reverse{"$i|$j"};
            } else {
                print OUT " 0";
            }
        }
        print OUT " }";
    }
    print OUT "\n";

    print OUT <<EOT;
};
EOT

    # Buffered write errors only surface when the handle is closed.
    close OUT or die "Cannot close gunicomp.h: $!\n";

    $bytes_out += $n_first * $n_second * 2;

    printf STDERR "Generated %d bytes in compose tables\n", $bytes_out;
}

# Write the C definition of casefold_table[] to the filehandle $out.
#
# Inputs (package globals):
#   @casefold    - list of [ code point, string ] records; sorted in place by
#                  code point as a side effect.
#   $casefoldlen - size used for the "data" member of the C struct.
#
# Each record becomes one initializer of the form { 0xNNNN, "string" }.  The
# string is emitted verbatim between double quotes, so it must already be a
# valid C string-literal body.
#
# Dies if a code point does not fit the guint16 "ch" member.
sub output_casefold_table
{
    my $out = shift;

    print $out <<EOT;

/* Table of casefolding cases that can't be derived by lowercasing
 */
static const struct {
  guint16 ch;
  gchar data[$casefoldlen];
} casefold_table[] = {
EOT

    @casefold = sort { $a->[0] <=> $b->[0] } @casefold;

    for my $case (@casefold) {
        my ($code, $string) = @$case;

        if ($code > 0xFFFF) {
            die "time to switch casefold_table to gunichar" ;
        }

        # Pass the string as an argument rather than interpolating it into
        # the format, so a '%' in the data is not taken as a conversion.
        printf {$out} qq( { 0x%04x, "%s" },\n), $code, $string;
    }

    print $out <<EOT;
};

EOT

    # Size of one struct rounded up to an even number of bytes:
    # 2 for "ch" plus $casefoldlen for "data".
    my $recordlen = (2+$casefoldlen+1) & ~1;

    # Statistics go to STDERR, like the compose-table statistic.
    printf STDERR "Generated %d bytes for casefold table\n", $recordlen * @casefold;
}

# Write one "struct Interval" array of code-point ranges to $out.
#
#   $out  - filehandle to print to
#   $name - suffix of the C array name (g_unicode_width_table_$name)
#   $wpe  - regular expression (as a string) selecting the width properties
#           that belong in this table
#
# Reads the package global @eawidths, a list of [ start, end, property ]
# records.  Records whose property matches $wpe are emitted; runs of matching
# records that are directly adjacent (next start == previous end + 1) are
# merged into a single interval.  The merge relies on @eawidths already being
# ordered by start.
sub output_one_width_table
{
    my ($out, $name, $wpe) = @_;

    my $rex = qr/$wpe/;

    print $out "static const struct Interval g_unicode_width_table_${name}[] = {\n";

    for (my $i = 0; $i <= $#eawidths; $i++) {
        my ($start, $end, $wp) = @{ $eawidths[$i] }[0 .. 2];

        next if $wp !~ $rex;

        # Swallow following records that continue this interval.
        while ($i < $#eawidths
               && $eawidths[$i + 1]->[0] == $end + 1
               && $eawidths[$i + 1]->[2] =~ $rex) {
            $i++;
            $end = $eawidths[$i]->[1];
        }

        printf $out "{0x%04X, 0x%04X},\n", $start, $end;
    }

    print $out "};\n\n";
}

# Write the "struct Interval" declaration and the two width tables
# ("wide" and "ambiguous") to the filehandle $out.
#
# Sorts the package global @eawidths (records of [ start, end, property ])
# by start code point in place; output_one_width_table depends on that order
# to merge adjacent ranges.
sub output_width_tables
{
    my $out = shift;

    @eawidths = sort { $a->[0] <=> $b->[0] } @eawidths;

    print $out <<EOT;

struct Interval
{
  gunichar start, end;
};

EOT

    # Properties F and W go into the "wide" table, A into "ambiguous".
    output_one_width_table ($out, "wide", "[FW]");
    output_one_width_table ($out, "ambiguous", "[A]");
}

# Write the script lookup tables to gscripttable.h.
#
# Reads the package global @scripts, a list of [ start, end, script name ]
# records, and sorts it in place by start code point.  Two C tables are
# produced:
#
#   g_script_easy_table - one entry for every code point below
#                         G_EASY_SCRIPTS_RANGE (0x2000); code points not
#                         covered by any record get G_UNICODE_SCRIPT_UNKNOWN.
#   g_script_table      - { start, length, script } runs for everything from
#                         G_EASY_SCRIPTS_RANGE upwards, with adjacent records
#                         of the same script merged.
#
# A record that straddles the 0x2000 boundary has its start moved up to the
# boundary (in @scripts itself) so that its upper part lands in
# g_script_table.
#
# Dies if the file cannot be opened or closed, or if a merged run is too long
# for the guint16 "chars" member.
sub print_scripts
{
    print STDERR "Writing gscripttable.h\n";

    open OUT, ">gscripttable.h" or die "Cannot open gscripttable.h: $!\n";

    print OUT<<EOT;
/* This file is automatically generated. DO NOT EDIT!
   Instead, edit gen-unicode-tables.pl and re-run. */

#ifndef SCRIPTTABLES_H
#define SCRIPTTABLES_H

EOT

    @scripts = sort { $a->[0] <=> $b->[0] } @scripts;

    my $easy_range = 0x2000;

    print OUT<<EOT;
#define G_EASY_SCRIPTS_RANGE $easy_range

static const guchar g_script_easy_table[$easy_range] = {
EOT

    my $i = 0;          # index of the next unread record in @scripts
    my $start;
    my $script;
    my $end = -1;       # forces the first record to be loaded at $c == 0

    for (my $c = 0; $c < $easy_range; $c++) {

        # three entries per output line
        if ($c % 3 == 0) {
            print OUT "\n ";
        }

        if ($c > $end) {
            if ($i <= $#scripts) {
                ($start, $end, $script) = @{ $scripts[$i] }[0 .. 2];
                $i++;
            } else {
                # No records left: treat the rest of the easy range as a
                # gap instead of reading past the end of @scripts.
                $start = $easy_range;
                $end   = $easy_range - 1;
            }
        }

        if ($c < $start) {
            print OUT " G_UNICODE_SCRIPT_UNKNOWN,";
        } else {
            printf OUT " G_UNICODE_SCRIPT_%s,", $script;
        }
    }

    # The record loaded last reaches into (or lies wholly in) the upper
    # range, so it still has to be emitted below.  Only clip its start when
    # it really began below the boundary; a record that begins above it must
    # not be stretched down over the gap in front of it.
    if ($end >= $easy_range) {
        $i--;
        $scripts[$i]->[0] = $easy_range if $scripts[$i]->[0] < $easy_range;
    }

    print OUT<<EOT;

};

static const struct {
  gunichar start;
  guint16 chars;
  guint16 script;
} g_script_table[] = {
EOT

    for (; $i <= $#scripts; $i++) {
        ($start, $end, $script) = @{ $scripts[$i] }[0 .. 2];

        # Merge directly adjacent records of the same script.
        while ($i < $#scripts
               && $scripts[$i + 1]->[0] == $end + 1
               && $scripts[$i + 1]->[2] eq $script) {
            $i++;
            $end = $scripts[$i]->[1];
        }

        my $chars = $end - $start + 1;
        if ($chars > 0xFFFF) {
            die "time to switch g_script_table chars to gunichar" ;
        }

        printf OUT " { %#06x, %5d, G_UNICODE_SCRIPT_%s },\n", $start, $chars, $script;
    }

    print OUT<<EOT;
};

#endif /* SCRIPTTABLES_H */
EOT

    # Buffered write errors only surface when the handle is closed.
    close OUT or die "Cannot close gscripttable.h: $!\n";
}

# nexmon – Blame information for rev 1