source: bootcd/isolinux/syslinux-6.03/codepage/gensubset.pl

Last change on this file was e16e8f2, checked in by Edwin Eefting <edwin@datux.nl>, 3 years ago

bootstuff

  • Property mode set to 100755
File size: 1.4 KB
RevLine 
#!/usr/bin/perl
#
# Generate a subset of the UnicodeData.txt file, available from
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
#
# Usage:
#   gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
#
# A line of a subset file counts only if, once any '#' comment and
# trailing blanks are removed, it consists of exactly two
# whitespace-separated fields; the second field is read as a hexadecimal
# code point.  Every such code point, plus the case variants listed for
# it in UnicodeData.txt, is copied to the output.
#

use strict;
use warnings;

# Mark as needed all the characters mentioned in one subset file.
#
#   $need - hash ref mapping code point (number) to a hit count;
#           updated in place
#   $fh   - filehandle the subset file is read from
sub mark_subset_file {
    my ($need, $fh) = @_;

    while (defined(my $line = <$fh>)) {
        $line =~ s/\s*(\#.*|)$//; # Remove comments and final blanks
        my @f = split(/\s+/, $line);
        next if (scalar @f != 2);
        $need->{hex $f[1]}++;
    }
    return;
}

# Also mark as needed any case variants of the characters already marked.
# (Note: this doesn't necessarily provide the full transitive closure,
# but we shouldn't need it.)
#
#   $need      - hash ref as for mark_subset_file(); updated in place
#   $ucd_lines - array ref holding the raw lines of UnicodeData.txt
sub mark_case_variants {
    my ($need, $ucd_lines) = @_;

    foreach my $line (@$ucd_lines) {
        # Work on a copy without the line ending.  Otherwise an empty
        # last field is "\n", which is not the empty string, and
        # hex("\n") == 0 would mark U+0000 as needed by accident.
        (my $record = $line) =~ s/[\r\n]+\z//;

        my @f = split(/;/, $record);
        next unless (defined $f[0] && $f[0] =~ /^([0-9a-f]+)$/i);
        next unless ($need->{hex $f[0]});

        # Fields 12, 13 and 14 are the three case mapping fields.
        # split() drops trailing empty fields, so they may be undefined.
        foreach my $idx (12, 13, 14) {
            next unless (defined $f[$idx] && $f[$idx] ne '');
            $need->{hex $f[$idx]}++;
        }
    }
    return;
}

# Build the subset: one output line per needed code point.
#
#   $need      - hash ref as for mark_subset_file(); only read
#   $ucd_lines - array ref holding the raw lines of UnicodeData.txt
#
# Returns the list of output lines.  Each is the code point as at least
# four upper-case hex digits, a ';', and the rest of the input line
# exactly as read (including its line ending).
sub subset_lines {
    my ($need, $ucd_lines) = @_;
    my @out;

    foreach my $line (@$ucd_lines) {
        my ($v, $rest) = split(/;/, $line, 2);
        next unless (defined $v);
        $rest = '' unless (defined $rest);

        my ($r1, $r2);
        if ($v =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
            # This isn't actually the format... fix that if it ever matters
            $r1 = hex $1;
            $r2 = hex $2;
        }
        elsif ($v =~ /^([0-9a-f]+)$/i) {
            $r1 = $r2 = hex $1;
        }
        else {
            next;
        }

        foreach my $r ($r1 .. $r2) {
            push @out, sprintf("%04X;%s", $r, $rest) if ($need->{$r});
        }
    }
    return @out;
}

# Entry point: subset files come from @ARGV, UnicodeData.txt from STDIN,
# and the subset is written to STDOUT.
sub main {
    my %need_these = ();

    foreach my $file (@ARGV) {
        open(my $fh, '<', $file)
            or die "$0: cannot open $file: $!\n";
        mark_subset_file(\%need_these, $fh);
        close($fh);
    }

    # UnicodeData.txt is scanned twice.  Keep it in memory rather than
    # rewinding STDIN: seek() fails when the input is a pipe, and an
    # unchecked failure would silently produce an empty subset.
    my @ucd_lines = <STDIN>;

    mark_case_variants(\%need_these, \@ucd_lines);

    # Finally, write out the subset
    print subset_lines(\%need_these, \@ucd_lines);
    return;
}

# Run only when executed as a script, not when loaded by other code.
main() unless caller;
Note: See TracBrowser for help on using the repository browser.