Context Navigation

source: bootcd/isolinux/syslinux-6.03/codepage/cptable.pl

Last change on this file was e16e8f2, checked in by Edwin Eefting <edwin@datux.nl>, 3 years ago
bootstuff
Property mode set to `100755`
File size: 4.7 KB

Rev	Line
[e16e8f2]	1	#!/usr/bin/perl
	2	#
	3	# Produce a codepage matching table. For each 8-bit character, list
	4	# a primary and an alternate match (the latter used for case-insensitive
	5	# matching.)
	6	#
	7	# Usage:
	8	# cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp
	9	#
	10	# Note: for the format of the UnicodeData file, see:
	11	# http://www.unicode.org/Public/UNIDATA/UCD.html
	12	#
	13
	14	($ucd, $cpco, $cpfs, $cpout) = @ARGV;
	15
	16	if (!defined($cpout)) {
	17	die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n";
	18	}
	19
	20	%ucase = ();
	21	%lcase = ();
	22	%tcase = ();
	23	%decomp = ();
	24
	25	open(UCD, '<', $ucd)
	26	or die "$0: could not open unicode data: $ucd: $!\n";
	27	while (defined($line = <UCD>)) {
	28	chomp $line;
	29	@f = split(/;/, $line);
	30	$n = hex $f[0];
	31	$ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n;
	32	$lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n;
	33	$tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n;
	34	if ($f[5] =~ /^[0-9A-F\s]+$/) {
	35	# This character has a canonical decomposition.
	36	# The regular expression rejects angle brackets, so other
	37	# decompositions aren't permitted.
	38	$decomp{$n} = [];
	39	foreach my $dch (split(' ', $f[5])) {
	40	push(@{$decomp{$n}}, hex $dch);
	41	}
	42	}
	43	}
	44	close(UCD);
	45
	46	#
	47	# Filesystem and console codepages. The filesystem codepage is used
	48	# for FAT shortnames, whereas the console codepage is whatever is used
	49	# on the screen and keyboard.
	50	#
	51	@xtab = (undef) x 256;
	52	%tabx = ();
	53	open(CPFS, '<', $cpfs)
	54	or die "$0: could not open fs codepage: $cpfs: $!\n";
	55	while (defined($line = <CPFS>)) {
	56	$line =~ s/\s(\#.\|)$//;
	57	@f = split(/\s+/, $line);
	58	next if (scalar @f != 2);
	59	next if (hex $f[0] > 255);
	60	$xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
	61	$tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
	62	}
	63	close(CPFS);
	64
	65	@ytab = (undef) x 256;
	66	%taby = ();
	67	open(CPCO, '<', $cpco)
	68	or die "$0: could not open console codepage: $cpco: $!\n";
	69	while (defined($line = <CPCO>)) {
	70	$line =~ s/\s(\#.\|)$//;
	71	@f = split(/\s+/, $line);
	72	next if (scalar @f != 2);
	73	next if (hex $f[0] > 255);
	74	$ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
	75	$taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
	76	}
	77	close(CPCO);
	78
	79	open(CPOUT, '>', $cpout)
	80	or die "$0: could not open output file: $cpout: $!\n";
	81	#
	82	# Magic number, in anticipation of being able to load these
	83	# files dynamically...
	84	#
	85	print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1);
	86
	87	# Header fields available for future use...
	88	print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0);
	89
	90	#
	91	# Self (shortname) uppercase table.
	92	# This depends both on the console codepage and the filesystem codepage;
	93	# the logical transcoding operation is:
	94	#
	95	# $tabx{$ucase{$ytab[$i]}}
	96	#
	97	# ... where @ytab is console codepage -> Unicode and
	98	# %tabx is Unicode -> filesystem codepage.
	99	#
	100	@uctab = (undef) x 256;
	101	for ($i = 0; $i < 256; $i++) {
	102	$uuc = $ucase{$ytab[$i]}; # Unicode upper case
	103	if (defined($tabx{$uuc})) {
	104	# Straight-forward conversion
	105	$u = $tabx{$uuc};
	106	} elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
	107	# Upper case equivalent stripped of accents
	108	$u = $tabx{${$decomp{$uuc}}[0]};
	109	} else {
	110	# No equivalent at all found. Assume it is a lower-case-only
	111	# character, like greek alpha in CP437.
	112	$u = $i;
	113	}
	114	$uctab[$i] = $u;
	115	print CPOUT pack("C", $u);
	116	}
	117
	118	#
	119	# Self (shortname) lowercase table.
	120	# This depends both on the console codepage and the filesystem codepage;
	121	# the logical transcoding operation is:
	122	#
	123	# $taby{$lcase{$xtab[$i]}}
	124	#
	125	# ... where @ytab is console codepage -> Unicode and
	126	# %tabx is Unicode -> filesystem codepage.
	127	#
	128	@lctab = (undef) x 256;
	129	for ($i = 0; $i < 256; $i++) {
	130	$llc = $lcase{$xtab[$i]}; # Unicode lower case
	131	if (defined($l = $taby{$llc}) && $uctab[$l] == $i) {
	132	# Straight-forward conversion
	133	} elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) {
	134	# Lower case equivalent stripped of accents
	135	} else {
	136	# No equivalent at all found. Find anything that matches the
	137	# bijection criterion...
	138	for ($l = 0; $l < 256; $l++) {
	139	last if ($uctab[$l] == $i);
	140	}
	141	$l = $i if ($l == 256); # If nothing, we're screwed anyway...
	142	}
	143	$lctab[$i] = $l;
	144	print CPOUT pack("C", $l);
	145	}
	146
	147	#
	148	# Unicode (longname) matching table.
	149	# This only depends on the console codepage.
	150	#
	151	$pp0 = ''; $pp1 = '';
	152	for ($i = 0; $i < 256; $i++) {
	153	if (!defined($ytab[$i])) {
	154	$p0 = $p1 = 0xffff;
	155	} else {
	156	$p0 = $ytab[$i];
	157	if ($ucase{$p0} != $p0) {
	158	$p1 = $ucase{$p0};
	159	} elsif ($lcase{$p0} != $p0) {
	160	$p1 = $lcase{$p0};
	161	} elsif ($tcase{$p0} != $p0) {
	162	$p1 = $tcase{$p0};
	163	} else {
	164	$p1 = $p0;
	165	}
	166	}
	167	# Only the BMP is supported...
	168	$p0 = 0xffff if ($p0 > 0xffff);
	169	$p1 = 0xffff if ($p1 > 0xffff);
	170	$pp0 .= pack("v", $p0);
	171	$pp1 .= pack("v", $p1);
	172	}
	173	print CPOUT $pp0, $pp1;
	174	close (CPOUT);
	175
	176

Note: See TracBrowser for help on using the repository browser.

Download in other formats: