Context Navigation

source: bootcd/isolinux/syslinux-6.03/codepage/cptable.pl

Last change on this file was e16e8f2, checked in by Edwin Eefting <edwin@datux.nl>, 3 years ago
bootstuff
Property mode set to `100755`
File size: 4.7 KB

Line
1	#!/usr/bin/perl
2	#
3	# Produce a codepage matching table. For each 8-bit character, list
4	# a primary and an alternate match (the latter used for case-insensitive
5	# matching.)
6	#
7	# Usage:
8	# cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp
9	#
10	# Note: for the format of the UnicodeData file, see:
11	# http://www.unicode.org/Public/UNIDATA/UCD.html
12	#
13
# Positional arguments, in order: the UnicodeData file, the console
# codepage table, the filesystem codepage table, and the output file.
($ucd, $cpco, $cpfs, $cpout) = @ARGV;

# The output file is the last argument, so if it is missing then too
# few arguments were supplied.
die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n"
    unless defined($cpout);
19
# Tables filled from the UnicodeData file, all keyed by numeric code point.
%ucase  = ();    # code point -> value of field 12, or itself if empty
%lcase  = ();    # code point -> value of field 13, or itself if empty
%tcase  = ();    # code point -> value of field 14, or itself if empty
%decomp = ();    # code point -> array ref of decomposition code points

open(UCD, '<', $ucd)
    or die "$0: could not open unicode data: $ucd: $!\n";
while (defined(my $record = <UCD>)) {
    chomp $record;

    # Records are semicolon-separated; field 0 is the code point in hex.
    my @field = split(/;/, $record);
    my $cp    = hex($field[0]);

    # An empty (or absent) case-mapping field means the character maps
    # to itself.
    my ($upper, $lower, $title) = @field[12, 13, 14];
    $ucase{$cp} = ($upper ne '') ? hex($upper) : $cp;
    $lcase{$cp} = ($lower ne '') ? hex($lower) : $cp;
    $tcase{$cp} = ($title ne '') ? hex($title) : $cp;

    # Field 5 holds the decomposition.  Only canonical decompositions are
    # accepted: the pattern admits nothing but hex digits and whitespace,
    # so any entry carrying an angle-bracket tag is rejected.
    next unless $field[5] =~ /^[0-9A-F\s]+$/;
    $decomp{$cp} = [ map { hex($_) } split(' ', $field[5]) ];
}
close(UCD);
45
46	#
47	# Filesystem and console codepages. The filesystem codepage is used
48	# for FAT shortnames, whereas the console codepage is whatever is used
49	# on the screen and keyboard.
50	#
# Filesystem codepage mapping tables.
@xtab = (undef) x 256;    # filesystem codepage byte -> Unicode code point
%tabx = ();               # Unicode code point -> filesystem codepage byte
open(CPFS, '<', $cpfs)
    or die "$0: could not open fs codepage: $cpfs: $!\n";
while (defined($line = <CPFS>)) {
    # Strip an optional trailing "# comment" together with the whitespace
    # in front of it.  Without this, a commented mapping line splits into
    # more than two fields and is skipped by the check below.
    $line =~ s/\s*(?:\#.*)?$//;

    # A usable mapping line is exactly "<codepage byte> <code point>",
    # both in hex; anything else (blank, comment-only, unmapped) is skipped.
    @f = split(/\s+/, $line);
    next if (scalar @f != 2);

    my $cc = hex($f[0]);    # codepage byte
    my $uc = hex($f[1]);    # Unicode code point
    next if ($cc > 255);    # only 8-bit codepages are handled

    $xtab[$cc] = $uc;       # Codepage -> Unicode
    $tabx{$uc} = $cc;       # Unicode -> Codepage
}
close(CPFS);
64
# Console codepage mapping tables.
@ytab = (undef) x 256;    # console codepage byte -> Unicode code point
%taby = ();               # Unicode code point -> console codepage byte
open(CPCO, '<', $cpco)
    or die "$0: could not open console codepage: $cpco: $!\n";
while (defined($line = <CPCO>)) {
    # Strip an optional trailing "# comment" together with the whitespace
    # in front of it.  Without this, a commented mapping line splits into
    # more than two fields and is skipped by the check below.
    $line =~ s/\s*(?:\#.*)?$//;

    # A usable mapping line is exactly "<codepage byte> <code point>",
    # both in hex; anything else (blank, comment-only, unmapped) is skipped.
    @f = split(/\s+/, $line);
    next if (scalar @f != 2);

    my $cc = hex($f[0]);    # codepage byte
    my $uc = hex($f[1]);    # Unicode code point
    next if ($cc > 255);    # only 8-bit codepages are handled

    $ytab[$cc] = $uc;       # Codepage -> Unicode
    $taby{$uc} = $cc;       # Unicode -> Codepage
}
close(CPCO);
78
open(CPOUT, '>', $cpout)
    or die "$0: could not open output file: $cpout: $!\n";

# Everything written to this handle is packed binary data, so disable
# any newline translation or I/O layers that would alter the bytes.
binmode(CPOUT);

#
# Magic number, in anticipation of being able to load these
# files dynamically...
#
print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1);

# Header fields available for future use (six 32-bit little-endian zeros)...
print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0);
89
90	#
91	# Self (shortname) uppercase table.
92	# This depends both on the console codepage and the filesystem codepage;
93	# the logical transcoding operation is:
94	#
95	# $tabx{$ucase{$ytab[$i]}}
96	#
97	# ... where @ytab is console codepage -> Unicode and
98	# %tabx is Unicode -> filesystem codepage.
99	#
# Build and emit the 256-entry uppercase table: one output byte per
# console codepage byte, giving the filesystem codepage byte of its
# upper-case form.
@uctab = (undef) x 256;
foreach my $cc (0 .. 255) {
    my $upper = $ucase{$ytab[$cc]};    # Unicode upper case

    # Straight-forward conversion, when the upper-case code point exists
    # in the filesystem codepage.
    my $fs = $tabx{$upper};

    # Otherwise try the first code point of its decomposition, i.e. the
    # upper case equivalent stripped of accents.
    if (!defined($fs)) {
        $fs = $tabx{${$decomp{$upper}}[0]};
    }

    # No equivalent at all found.  Assume it is a lower-case-only
    # character, like greek alpha in CP437, and map the byte to itself.
    if (!defined($fs)) {
        $fs = $cc;
    }

    $uctab[$cc] = $fs;
    print CPOUT pack("C", $fs);
}
117
#
# Self (shortname) lowercase table.
# This depends both on the console codepage and the filesystem codepage;
# the logical transcoding operation is:
#
# $taby{$lcase{$xtab[$i]}}
#
# ... where @xtab is filesystem codepage -> Unicode and
# %taby is Unicode -> console codepage.
#
# Every entry must also satisfy the bijection criterion
# $uctab[$lctab[$i]] == $i, so each candidate is checked against @uctab
# (which is indexed by console codepage byte) before being accepted.
#
@lctab = (undef) x 256;
for ($i = 0; $i < 256; $i++) {
    $llc = $lcase{$xtab[$i]};    # Unicode lower case

    # First choice: straight-forward conversion.
    my $lc = $taby{$llc};

    if (!(defined($lc) && $uctab[$lc] == $i)) {
        # Second choice: lower case equivalent stripped of accents (the
        # first code point of the decomposition).  The candidate is used
        # as an index into @uctab, so it has to be looked up in the
        # console table %taby, not the filesystem table %tabx.
        $lc = $taby{${$decomp{$llc}}[0]};
    }

    if (!(defined($lc) && $uctab[$lc] == $i)) {
        # No equivalent at all found.  Find anything that matches the
        # bijection criterion...
        for ($lc = 0; $lc < 256; $lc++) {
            last if ($uctab[$lc] == $i);
        }
        $lc = $i if ($lc == 256);    # If nothing, we're screwed anyway...
    }

    $lctab[$i] = $lc;
    print CPOUT pack("C", $lc);
}
146
147	#
148	# Unicode (longname) matching table.
149	# This only depends on the console codepage.
150	#
# Two parallel 256-entry tables of 16-bit little-endian code points, one
# entry per console codepage byte:
#   $pp0 - primary match: the code point the byte maps to
#   $pp1 - alternate match: its first differing case variant, tried in the
#          order upper, lower, title; the primary itself if none differs
# Bytes with no mapping get 0xffff in both tables.
$pp0 = ''; $pp1 = '';
for ($i = 0; $i < 256; $i++) {
    if (!defined($ytab[$i])) {
        $p0 = $p1 = 0xffff;
    } else {
        $p0 = $ytab[$i];
        $p1 = $p0;
        foreach my $casemap (\%ucase, \%lcase, \%tcase) {
            my $alt = $casemap->{$p0};
            # A code point that never appeared in the UnicodeData file has
            # no case-map entry; treat that as "maps to itself" instead of
            # letting undef be packed as U+0000.
            next unless defined($alt) && $alt != $p0;
            $p1 = $alt;
            last;
        }
    }
    # Only the BMP is supported...
    $p0 = 0xffff if ($p0 > 0xffff);
    $p1 = 0xffff if ($p1 > 0xffff);
    $pp0 .= pack("v", $p0);
    $pp1 .= pack("v", $p1);
}
print CPOUT $pp0, $pp1;

# Buffered write errors (e.g. disk full) only surface at close, so check it.
close(CPOUT)
    or die "$0: error writing output file: $cpout: $!\n";
175
176

Note: See TracBrowser for help on using the repository browser.

Download in other formats: