1 | #!/usr/bin/perl |
---|
2 | # |
---|
3 | # Produce a codepage matching table. For each 8-bit character, list |
---|
4 | # a primary and an alternate match (the latter used for case-insensitive |
---|
5 | # matching.) |
---|
6 | # |
---|
7 | # Usage: |
---|
8 | # cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp |
---|
9 | # |
---|
10 | # Note: for the format of the UnicodeData file, see: |
---|
11 | # http://www.unicode.org/Public/UNIDATA/UCD.html |
---|
12 | # |
---|
13 | |
---|
14 | ($ucd, $cpco, $cpfs, $cpout) = @ARGV; |
---|
15 | |
---|
16 | if (!defined($cpout)) { |
---|
17 | die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n"; |
---|
18 | } |
---|
19 | |
---|
20 | %ucase = (); |
---|
21 | %lcase = (); |
---|
22 | %tcase = (); |
---|
23 | %decomp = (); |
---|
24 | |
---|
25 | open(UCD, '<', $ucd) |
---|
26 | or die "$0: could not open unicode data: $ucd: $!\n"; |
---|
27 | while (defined($line = <UCD>)) { |
---|
28 | chomp $line; |
---|
29 | @f = split(/;/, $line); |
---|
30 | $n = hex $f[0]; |
---|
31 | $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n; |
---|
32 | $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n; |
---|
33 | $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n; |
---|
34 | if ($f[5] =~ /^[0-9A-F\s]+$/) { |
---|
35 | # This character has a canonical decomposition. |
---|
36 | # The regular expression rejects angle brackets, so other |
---|
37 | # decompositions aren't permitted. |
---|
38 | $decomp{$n} = []; |
---|
39 | foreach my $dch (split(' ', $f[5])) { |
---|
40 | push(@{$decomp{$n}}, hex $dch); |
---|
41 | } |
---|
42 | } |
---|
43 | } |
---|
44 | close(UCD); |
---|
45 | |
---|
46 | # |
---|
47 | # Filesystem and console codepages. The filesystem codepage is used |
---|
48 | # for FAT shortnames, whereas the console codepage is whatever is used |
---|
49 | # on the screen and keyboard. |
---|
50 | # |
---|
51 | @xtab = (undef) x 256; |
---|
52 | %tabx = (); |
---|
53 | open(CPFS, '<', $cpfs) |
---|
54 | or die "$0: could not open fs codepage: $cpfs: $!\n"; |
---|
55 | while (defined($line = <CPFS>)) { |
---|
56 | $line =~ s/\s*(\#.*|)$//; |
---|
57 | @f = split(/\s+/, $line); |
---|
58 | next if (scalar @f != 2); |
---|
59 | next if (hex $f[0] > 255); |
---|
60 | $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode |
---|
61 | $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage |
---|
62 | } |
---|
63 | close(CPFS); |
---|
64 | |
---|
65 | @ytab = (undef) x 256; |
---|
66 | %taby = (); |
---|
67 | open(CPCO, '<', $cpco) |
---|
68 | or die "$0: could not open console codepage: $cpco: $!\n"; |
---|
69 | while (defined($line = <CPCO>)) { |
---|
70 | $line =~ s/\s*(\#.*|)$//; |
---|
71 | @f = split(/\s+/, $line); |
---|
72 | next if (scalar @f != 2); |
---|
73 | next if (hex $f[0] > 255); |
---|
74 | $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode |
---|
75 | $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage |
---|
76 | } |
---|
77 | close(CPCO); |
---|
78 | |
---|
79 | open(CPOUT, '>', $cpout) |
---|
80 | or die "$0: could not open output file: $cpout: $!\n"; |
---|
81 | # |
---|
82 | # Magic number, in anticipation of being able to load these |
---|
83 | # files dynamically... |
---|
84 | # |
---|
85 | print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1); |
---|
86 | |
---|
87 | # Header fields available for future use... |
---|
88 | print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0); |
---|
89 | |
---|
90 | # |
---|
91 | # Self (shortname) uppercase table. |
---|
92 | # This depends both on the console codepage and the filesystem codepage; |
---|
93 | # the logical transcoding operation is: |
---|
94 | # |
---|
95 | # $tabx{$ucase{$ytab[$i]}} |
---|
96 | # |
---|
97 | # ... where @ytab is console codepage -> Unicode and |
---|
98 | # %tabx is Unicode -> filesystem codepage. |
---|
99 | # |
---|
100 | @uctab = (undef) x 256; |
---|
101 | for ($i = 0; $i < 256; $i++) { |
---|
102 | $uuc = $ucase{$ytab[$i]}; # Unicode upper case |
---|
103 | if (defined($tabx{$uuc})) { |
---|
104 | # Straight-forward conversion |
---|
105 | $u = $tabx{$uuc}; |
---|
106 | } elsif (defined($tabx{${$decomp{$uuc}}[0]})) { |
---|
107 | # Upper case equivalent stripped of accents |
---|
108 | $u = $tabx{${$decomp{$uuc}}[0]}; |
---|
109 | } else { |
---|
110 | # No equivalent at all found. Assume it is a lower-case-only |
---|
111 | # character, like greek alpha in CP437. |
---|
112 | $u = $i; |
---|
113 | } |
---|
114 | $uctab[$i] = $u; |
---|
115 | print CPOUT pack("C", $u); |
---|
116 | } |
---|
117 | |
---|
118 | # |
---|
119 | # Self (shortname) lowercase table. |
---|
120 | # This depends both on the console codepage and the filesystem codepage; |
---|
121 | # the logical transcoding operation is: |
---|
122 | # |
---|
123 | # $taby{$lcase{$xtab[$i]}} |
---|
124 | # |
---|
125 | # ... where @ytab is console codepage -> Unicode and |
---|
126 | # %tabx is Unicode -> filesystem codepage. |
---|
127 | # |
---|
128 | @lctab = (undef) x 256; |
---|
129 | for ($i = 0; $i < 256; $i++) { |
---|
130 | $llc = $lcase{$xtab[$i]}; # Unicode lower case |
---|
131 | if (defined($l = $taby{$llc}) && $uctab[$l] == $i) { |
---|
132 | # Straight-forward conversion |
---|
133 | } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) { |
---|
134 | # Lower case equivalent stripped of accents |
---|
135 | } else { |
---|
136 | # No equivalent at all found. Find *anything* that matches the |
---|
137 | # bijection criterion... |
---|
138 | for ($l = 0; $l < 256; $l++) { |
---|
139 | last if ($uctab[$l] == $i); |
---|
140 | } |
---|
141 | $l = $i if ($l == 256); # If nothing, we're screwed anyway... |
---|
142 | } |
---|
143 | $lctab[$i] = $l; |
---|
144 | print CPOUT pack("C", $l); |
---|
145 | } |
---|
146 | |
---|
147 | # |
---|
148 | # Unicode (longname) matching table. |
---|
149 | # This only depends on the console codepage. |
---|
150 | # |
---|
151 | $pp0 = ''; $pp1 = ''; |
---|
152 | for ($i = 0; $i < 256; $i++) { |
---|
153 | if (!defined($ytab[$i])) { |
---|
154 | $p0 = $p1 = 0xffff; |
---|
155 | } else { |
---|
156 | $p0 = $ytab[$i]; |
---|
157 | if ($ucase{$p0} != $p0) { |
---|
158 | $p1 = $ucase{$p0}; |
---|
159 | } elsif ($lcase{$p0} != $p0) { |
---|
160 | $p1 = $lcase{$p0}; |
---|
161 | } elsif ($tcase{$p0} != $p0) { |
---|
162 | $p1 = $tcase{$p0}; |
---|
163 | } else { |
---|
164 | $p1 = $p0; |
---|
165 | } |
---|
166 | } |
---|
167 | # Only the BMP is supported... |
---|
168 | $p0 = 0xffff if ($p0 > 0xffff); |
---|
169 | $p1 = 0xffff if ($p1 > 0xffff); |
---|
170 | $pp0 .= pack("v", $p0); |
---|
171 | $pp1 .= pack("v", $p1); |
---|
172 | } |
---|
173 | print CPOUT $pp0, $pp1; |
---|
174 | close (CPOUT); |
---|
175 | |
---|
176 | |
---|