Context Navigation

gensubset.pl @ dd1be7c

Last change on this file since dd1be7c was e16e8f2, checked in by Edwin Eefting <edwin@datux.nl>, 3 years ago
bootstuff
Property mode set to `100755`
File size: 1.4 KB

Line
#!/usr/bin/perl
#
# Generate a subset of the UnicodeData.txt file, available from
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
#
# Usage:
#   gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
#
# Each subset file is read line by line.  After '#' comments and trailing
# blanks are removed, a line counts only if it has exactly two
# whitespace-separated fields; the second field is taken as a hexadecimal
# code point (an optional "0x" prefix is accepted).
#
# UnicodeData.txt is read from standard input exactly once and kept in
# memory, so the input may be a pipe as well as a regular file.
#

use strict;
use warnings;

# Return the code point named by one line of a subset file, or nothing
# (undef in scalar context) when the line does not describe a mapping.
sub parse_subset_line {
    my ($line) = @_;

    $line =~ s/\s*(?:\#.*)?$//;    # Remove comments and final blanks
    my @fields = split /\s+/, $line;

    return unless @fields == 2;
    # Reject anything hex() would not parse cleanly instead of letting it
    # silently turn into code point 0.
    return unless $fields[1] =~ /\A(?:0x)?[0-9a-f]+\z/i;

    return hex $fields[1];
}

# Mark as needed the case variants (fields 12, 13 and 14: upper, lower and
# title case mappings) of every code point already marked in %$need.
# Records are processed in input order and %$need is updated as we go.
# (Note: this doesn't necessarily provide the full transitive closure,
# but we shouldn't need it.)
sub add_case_variants {
    my ($need, $ucd_lines) = @_;

    for my $line (@$ucd_lines) {
        my $record = $line;
        # Strip the line terminator so that an empty last field really is
        # empty rather than "\n".
        $record =~ s/[\r\n]+\z//;

        my @f = split /;/, $record, -1;
        next unless @f && $f[0] =~ /\A[0-9a-f]+\z/i;
        next unless $need->{ hex $f[0] };

        for my $idx (12 .. 14) {
            my $variant = $f[$idx];
            next unless defined $variant && $variant =~ /\A[0-9a-f]+\z/i;
            $need->{ hex $variant }++;
        }
    }
    return;
}

# Return the output records, in input order, for every needed code point.
# Each record is the code point as at least four upper-case hex digits,
# a ';', and the remainder of the input line unchanged.
sub select_subset {
    my ($need, $ucd_lines) = @_;
    my @out;

    for my $line (@$ucd_lines) {
        my ($code, $rest) = split /;/, $line, 2;
        next unless defined $code;
        $rest = '' unless defined $rest;

        my ($first, $last);
        if ($code =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
            # This isn't actually the format (UnicodeData.txt marks ranges
            # with a pair of "<..., First>" / "<..., Last>" records)...
            # fix that if it ever matters
            ($first, $last) = (hex $1, hex $2);
        }
        elsif ($code =~ /^([0-9a-f]+)$/i) {
            $first = $last = hex $1;
        }
        else {
            next;
        }

        for my $cp ($first .. $last) {
            push @out, sprintf("%04X;%s", $cp, $rest) if $need->{$cp};
        }
    }
    return @out;
}

# Entry point: collect needed code points from the files named in @ARGV,
# extend them with case variants, and write the subset to standard output.
sub main {
    my %need_these;

    # Mark as needed all the characters mentioned in the relevant files
    for my $file (@ARGV) {
        open(my $fh, '<', $file)
            or die "$0: cannot open $file: $!\n";
        while (defined(my $line = <$fh>)) {
            my $cp = parse_subset_line($line);
            $need_these{$cp}++ if defined $cp;
        }
        close($fh);
    }

    # Two passes are needed over the data; keep it in memory instead of
    # seeking on STDIN, which fails when the input is not a regular file.
    my @ucd_lines = <STDIN>;

    # Also mark as needed any case variants of those
    add_case_variants(\%need_these, \@ucd_lines);

    # Finally, write out the subset
    print select_subset(\%need_these, \@ucd_lines);
    return;
}

main() unless caller;

Note: See TracBrowser for help on using the repository browser.