source: bootcd/isolinux/syslinux-6.03/codepage/cptable.pl @ e16e8f2

Last change on this file since e16e8f2 was e16e8f2, checked in by Edwin Eefting <edwin@datux.nl>, 3 years ago

bootstuff

  • Property mode set to 100755
File size: 4.7 KB
Line 
1#!/usr/bin/perl
2#
3# Produce a codepage matching table.  For each 8-bit character, list
4# a primary and an alternate match (the latter used for case-insensitive
5# matching.)
6#
7# Usage:
8#       cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp
9#
10# Note: for the format of the UnicodeData file, see:
11# http://www.unicode.org/Public/UNIDATA/UCD.html
12#
13
14($ucd, $cpco, $cpfs, $cpout) = @ARGV;
15
16if (!defined($cpout)) {
17    die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n";
18}
19
20%ucase   = ();
21%lcase   = ();
22%tcase   = ();
23%decomp  = ();
24
25open(UCD, '<', $ucd)
26    or die "$0: could not open unicode data: $ucd: $!\n";
27while (defined($line = <UCD>)) {
28    chomp $line;
29    @f = split(/;/, $line);
30    $n = hex $f[0];
31    $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n;
32    $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n;
33    $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n;
34    if ($f[5] =~ /^[0-9A-F\s]+$/) {
35        # This character has a canonical decomposition.
36        # The regular expression rejects angle brackets, so other
37        # decompositions aren't permitted.
38        $decomp{$n} = [];
39        foreach my $dch (split(' ', $f[5])) {
40            push(@{$decomp{$n}}, hex $dch);
41        }
42    }
43}
44close(UCD);
45
46#
47# Filesystem and console codepages.  The filesystem codepage is used
48# for FAT shortnames, whereas the console codepage is whatever is used
49# on the screen and keyboard.
50#
51@xtab = (undef) x 256;
52%tabx = ();
53open(CPFS, '<', $cpfs)
54    or die "$0: could not open fs codepage: $cpfs: $!\n";
55while (defined($line = <CPFS>)) {
56    $line =~ s/\s*(\#.*|)$//;
57    @f = split(/\s+/, $line);
58    next if (scalar @f != 2);
59    next if (hex $f[0] > 255);
60    $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
61    $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
62}
63close(CPFS);
64
65@ytab = (undef) x 256;
66%taby = ();
67open(CPCO, '<', $cpco)
68    or die "$0: could not open console codepage: $cpco: $!\n";
69while (defined($line = <CPCO>)) {
70    $line =~ s/\s*(\#.*|)$//;
71    @f = split(/\s+/, $line);
72    next if (scalar @f != 2);
73    next if (hex $f[0] > 255);
74    $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
75    $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
76}
77close(CPCO);
78
79open(CPOUT, '>', $cpout)
80    or die "$0: could not open output file: $cpout: $!\n";
81#
82# Magic number, in anticipation of being able to load these
83# files dynamically...
84#
85print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1);
86
87# Header fields available for future use...
88print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0);
89
90#
91# Self (shortname) uppercase table.
92# This depends both on the console codepage and the filesystem codepage;
93# the logical transcoding operation is:
94#
95# $tabx{$ucase{$ytab[$i]}}
96#
97# ... where @ytab is console codepage -> Unicode and
98# %tabx is Unicode -> filesystem codepage.
99#
100@uctab = (undef) x 256;
101for ($i = 0; $i < 256; $i++) {
102    $uuc = $ucase{$ytab[$i]};   # Unicode upper case
103    if (defined($tabx{$uuc})) {
104        # Straight-forward conversion
105        $u = $tabx{$uuc};
106    } elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
107        # Upper case equivalent stripped of accents
108        $u = $tabx{${$decomp{$uuc}}[0]};
109    } else {
110        # No equivalent at all found.  Assume it is a lower-case-only
111        # character, like greek alpha in CP437.
112        $u = $i;
113    }
114    $uctab[$i] = $u;
115    print CPOUT pack("C", $u);
116}
117
118#
119# Self (shortname) lowercase table.
120# This depends both on the console codepage and the filesystem codepage;
121# the logical transcoding operation is:
122#
123# $taby{$lcase{$xtab[$i]}}
124#
125# ... where @ytab is console codepage -> Unicode and
126# %tabx is Unicode -> filesystem codepage.
127#
128@lctab = (undef) x 256;
129for ($i = 0; $i < 256; $i++) {
130    $llc = $lcase{$xtab[$i]};   # Unicode lower case
131    if (defined($l = $taby{$llc}) && $uctab[$l] == $i) {
132        # Straight-forward conversion
133    } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) {
134        # Lower case equivalent stripped of accents
135    } else {
136        # No equivalent at all found.  Find *anything* that matches the
137        # bijection criterion...
138        for ($l = 0; $l < 256; $l++) {
139            last if ($uctab[$l] == $i);
140        }
141        $l = $i if ($l == 256); # If nothing, we're screwed anyway...
142    }
143    $lctab[$i] = $l;
144    print CPOUT pack("C", $l);
145}
146
147#
148# Unicode (longname) matching table.
149# This only depends on the console codepage.
150#
151$pp0 = '';  $pp1 = '';
152for ($i = 0; $i < 256; $i++) {
153    if (!defined($ytab[$i])) {
154        $p0 = $p1 = 0xffff;
155    } else {
156        $p0 = $ytab[$i];
157        if ($ucase{$p0} != $p0) {
158            $p1 = $ucase{$p0};
159        } elsif ($lcase{$p0} != $p0) {
160            $p1 = $lcase{$p0};
161        } elsif ($tcase{$p0} != $p0) {
162            $p1 = $tcase{$p0};
163        } else {
164            $p1 = $p0;
165        }
166    }
167    # Only the BMP is supported...
168    $p0 = 0xffff if ($p0 > 0xffff);
169    $p1 = 0xffff if ($p1 > 0xffff);
170    $pp0 .= pack("v", $p0);
171    $pp1 .= pack("v", $p1);
172}
173print CPOUT $pp0, $pp1;
174close (CPOUT);
175
176   
Note: See TracBrowser for help on using the repository browser.