[e16e8f2] | 1 | #!/usr/bin/perl |
---|
| 2 | # |
---|
| 3 | # Produce a codepage matching table. For each 8-bit character, list |
---|
| 4 | # a primary and an alternate match (the latter used for case-insensitive |
---|
| 5 | # matching.) |
---|
| 6 | # |
---|
| 7 | # Usage: |
---|
| 8 | # cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp |
---|
| 9 | # |
---|
| 10 | # Note: for the format of the UnicodeData file, see: |
---|
| 11 | # http://www.unicode.org/Public/UNIDATA/UCD.html |
---|
| 12 | # |
---|
| 13 | |
---|
| 14 | ($ucd, $cpco, $cpfs, $cpout) = @ARGV; |
---|
| 15 | |
---|
| 16 | if (!defined($cpout)) { |
---|
| 17 | die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n"; |
---|
| 18 | } |
---|
| 19 | |
---|
| 20 | %ucase = (); |
---|
| 21 | %lcase = (); |
---|
| 22 | %tcase = (); |
---|
| 23 | %decomp = (); |
---|
| 24 | |
---|
| 25 | open(UCD, '<', $ucd) |
---|
| 26 | or die "$0: could not open unicode data: $ucd: $!\n"; |
---|
| 27 | while (defined($line = <UCD>)) { |
---|
| 28 | chomp $line; |
---|
| 29 | @f = split(/;/, $line); |
---|
| 30 | $n = hex $f[0]; |
---|
| 31 | $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n; |
---|
| 32 | $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n; |
---|
| 33 | $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n; |
---|
| 34 | if ($f[5] =~ /^[0-9A-F\s]+$/) { |
---|
| 35 | # This character has a canonical decomposition. |
---|
| 36 | # The regular expression rejects angle brackets, so other |
---|
| 37 | # decompositions aren't permitted. |
---|
| 38 | $decomp{$n} = []; |
---|
| 39 | foreach my $dch (split(' ', $f[5])) { |
---|
| 40 | push(@{$decomp{$n}}, hex $dch); |
---|
| 41 | } |
---|
| 42 | } |
---|
| 43 | } |
---|
| 44 | close(UCD); |
---|
| 45 | |
---|
| 46 | # |
---|
| 47 | # Filesystem and console codepages. The filesystem codepage is used |
---|
| 48 | # for FAT shortnames, whereas the console codepage is whatever is used |
---|
| 49 | # on the screen and keyboard. |
---|
| 50 | # |
---|
| 51 | @xtab = (undef) x 256; |
---|
| 52 | %tabx = (); |
---|
| 53 | open(CPFS, '<', $cpfs) |
---|
| 54 | or die "$0: could not open fs codepage: $cpfs: $!\n"; |
---|
| 55 | while (defined($line = <CPFS>)) { |
---|
| 56 | $line =~ s/\s*(\#.*|)$//; |
---|
| 57 | @f = split(/\s+/, $line); |
---|
| 58 | next if (scalar @f != 2); |
---|
| 59 | next if (hex $f[0] > 255); |
---|
| 60 | $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode |
---|
| 61 | $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage |
---|
| 62 | } |
---|
| 63 | close(CPFS); |
---|
| 64 | |
---|
| 65 | @ytab = (undef) x 256; |
---|
| 66 | %taby = (); |
---|
| 67 | open(CPCO, '<', $cpco) |
---|
| 68 | or die "$0: could not open console codepage: $cpco: $!\n"; |
---|
| 69 | while (defined($line = <CPCO>)) { |
---|
| 70 | $line =~ s/\s*(\#.*|)$//; |
---|
| 71 | @f = split(/\s+/, $line); |
---|
| 72 | next if (scalar @f != 2); |
---|
| 73 | next if (hex $f[0] > 255); |
---|
| 74 | $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode |
---|
| 75 | $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage |
---|
| 76 | } |
---|
| 77 | close(CPCO); |
---|
| 78 | |
---|
| 79 | open(CPOUT, '>', $cpout) |
---|
| 80 | or die "$0: could not open output file: $cpout: $!\n"; |
---|
| 81 | # |
---|
| 82 | # Magic number, in anticipation of being able to load these |
---|
| 83 | # files dynamically... |
---|
| 84 | # |
---|
| 85 | print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1); |
---|
| 86 | |
---|
| 87 | # Header fields available for future use... |
---|
| 88 | print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0); |
---|
| 89 | |
---|
| 90 | # |
---|
| 91 | # Self (shortname) uppercase table. |
---|
| 92 | # This depends both on the console codepage and the filesystem codepage; |
---|
| 93 | # the logical transcoding operation is: |
---|
| 94 | # |
---|
| 95 | # $tabx{$ucase{$ytab[$i]}} |
---|
| 96 | # |
---|
| 97 | # ... where @ytab is console codepage -> Unicode and |
---|
| 98 | # %tabx is Unicode -> filesystem codepage. |
---|
| 99 | # |
---|
| 100 | @uctab = (undef) x 256; |
---|
| 101 | for ($i = 0; $i < 256; $i++) { |
---|
| 102 | $uuc = $ucase{$ytab[$i]}; # Unicode upper case |
---|
| 103 | if (defined($tabx{$uuc})) { |
---|
| 104 | # Straight-forward conversion |
---|
| 105 | $u = $tabx{$uuc}; |
---|
| 106 | } elsif (defined($tabx{${$decomp{$uuc}}[0]})) { |
---|
| 107 | # Upper case equivalent stripped of accents |
---|
| 108 | $u = $tabx{${$decomp{$uuc}}[0]}; |
---|
| 109 | } else { |
---|
| 110 | # No equivalent at all found. Assume it is a lower-case-only |
---|
| 111 | # character, like greek alpha in CP437. |
---|
| 112 | $u = $i; |
---|
| 113 | } |
---|
| 114 | $uctab[$i] = $u; |
---|
| 115 | print CPOUT pack("C", $u); |
---|
| 116 | } |
---|
| 117 | |
---|
| 118 | # |
---|
| 119 | # Self (shortname) lowercase table. |
---|
| 120 | # This depends both on the console codepage and the filesystem codepage; |
---|
| 121 | # the logical transcoding operation is: |
---|
| 122 | # |
---|
| 123 | # $taby{$lcase{$xtab[$i]}} |
---|
| 124 | # |
---|
| 125 | # ... where @ytab is console codepage -> Unicode and |
---|
| 126 | # %tabx is Unicode -> filesystem codepage. |
---|
| 127 | # |
---|
| 128 | @lctab = (undef) x 256; |
---|
| 129 | for ($i = 0; $i < 256; $i++) { |
---|
| 130 | $llc = $lcase{$xtab[$i]}; # Unicode lower case |
---|
| 131 | if (defined($l = $taby{$llc}) && $uctab[$l] == $i) { |
---|
| 132 | # Straight-forward conversion |
---|
| 133 | } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) { |
---|
| 134 | # Lower case equivalent stripped of accents |
---|
| 135 | } else { |
---|
| 136 | # No equivalent at all found. Find *anything* that matches the |
---|
| 137 | # bijection criterion... |
---|
| 138 | for ($l = 0; $l < 256; $l++) { |
---|
| 139 | last if ($uctab[$l] == $i); |
---|
| 140 | } |
---|
| 141 | $l = $i if ($l == 256); # If nothing, we're screwed anyway... |
---|
| 142 | } |
---|
| 143 | $lctab[$i] = $l; |
---|
| 144 | print CPOUT pack("C", $l); |
---|
| 145 | } |
---|
| 146 | |
---|
| 147 | # |
---|
| 148 | # Unicode (longname) matching table. |
---|
| 149 | # This only depends on the console codepage. |
---|
| 150 | # |
---|
| 151 | $pp0 = ''; $pp1 = ''; |
---|
| 152 | for ($i = 0; $i < 256; $i++) { |
---|
| 153 | if (!defined($ytab[$i])) { |
---|
| 154 | $p0 = $p1 = 0xffff; |
---|
| 155 | } else { |
---|
| 156 | $p0 = $ytab[$i]; |
---|
| 157 | if ($ucase{$p0} != $p0) { |
---|
| 158 | $p1 = $ucase{$p0}; |
---|
| 159 | } elsif ($lcase{$p0} != $p0) { |
---|
| 160 | $p1 = $lcase{$p0}; |
---|
| 161 | } elsif ($tcase{$p0} != $p0) { |
---|
| 162 | $p1 = $tcase{$p0}; |
---|
| 163 | } else { |
---|
| 164 | $p1 = $p0; |
---|
| 165 | } |
---|
| 166 | } |
---|
| 167 | # Only the BMP is supported... |
---|
| 168 | $p0 = 0xffff if ($p0 > 0xffff); |
---|
| 169 | $p1 = 0xffff if ($p1 > 0xffff); |
---|
| 170 | $pp0 .= pack("v", $p0); |
---|
| 171 | $pp1 .= pack("v", $p1); |
---|
| 172 | } |
---|
| 173 | print CPOUT $pp0, $pp1; |
---|
| 174 | close (CPOUT); |
---|
| 175 | |
---|
| 176 | |
---|