| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140 | 
							- #!/usr/bin/perl -- # -*- Perl -*-
 
- # this needs some cleanup...
 
- my $PSTOTEXT = "pstotext";
 
- my $pdf = shift @ARGV;
 
- my $index = "";
 
- my $inindex = 0;
 
- open (F, "$PSTOTEXT $pdf |");
 
- while (<F>) {
 
-     if (/^<\/index/) {
 
- 	$index .= $_;
 
- 	$inindex = 0;
 
-     }
 
-     $inindex = 1 if /^<index/;
 
-     if ($inindex) {
 
- 	$index .= $_ if /^\s*</;
 
-     }
 
- }
 
- my $cindex = "";
 
- while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {
 
-     $cindex .= $1;
 
-     $_ = $2;
 
-     $index = $'; # '
 
-     my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;
 
-     # Expand ranges
 
-     if ($#pages >= 0) {
 
- 	my @mpages = ();
 
- 	foreach my $page (@pages) {
 
- 	    my $pageno = &pageno($page);
 
- 	    if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
 
- 		for (my $count = $1; $count <= $2; $count++) {
 
- 		    push (@mpages, "<phrase role=\"$pageno\">$count</phrase>");
 
- 		}
 
- 	    } else {
 
- 		push (@mpages, $page);
 
- 	    }
 
- 	}
 
- 	@pages = sort rangesort @mpages;
 
-     }
 
-     # Remove duplicates...
 
-     if ($#pages > 0) {
 
- 	my @mpages = ();
 
- 	my $current = "";
 
- 	foreach my $page (@pages) {
 
- 	    my $pageno = &pageno($page);
 
- 	    if ($pageno ne $current) {
 
- 		push (@mpages, $page);
 
- 		$current = $pageno;
 
- 	    }
 
- 	}
 
- 	@pages = @mpages;
 
-     }
 
-     # Collapse ranges...
 
-     if ($#pages > 1) {
 
- 	my @cpages = ();
 
- 	while (@pages) {
 
- 	    my $count = 0;
 
- 	    my $len = &rangelen($count, @pages);
 
- 	    if ($len <= 2) {
 
- 		my $page = shift @pages;
 
- 		push (@cpages, $page);
 
- 	    } else {
 
- 		my $fpage = shift @pages;
 
- 		my $lpage = "";
 
- 		while ($len > 1) {
 
- 		    $lpage = shift @pages;
 
- 		    $len--;
 
- 		}
 
- 		my $fpno = &pageno($fpage);
 
- 		my $lpno = &pageno($lpage);
 
- 		$fpage =~ s/>$fpno</>${fpno}-$lpno</s;
 
- 		push (@cpages, $fpage);
 
- 	    }
 
- 	}
 
- 	@pages = @cpages;
 
-     }
 
-     my $page = shift @pages;
 
-     $page =~ s/\s*$//s;
 
-     $cindex .= $page;
 
-     while (@pages) {
 
- 	$page = shift @pages;
 
- 	$page =~ s/\s*$//s;
 
- 	$cindex .= ", $page";
 
-     }
 
- }
 
- $cindex .= $index;
 
- print "$cindex\n";
 
- sub pageno {
 
-     my $page = shift;
 
-     $page =~ s/^<phrase.*?>//;
 
-     $page =~ s/^<link.*?>//;
 
-     return $1 if $page =~ /^([^<>]+)/;
 
-     return "?";
 
- }
 
- sub rangesort {
 
-     my $apno = &pageno($a);
 
-     my $bpno = &pageno($b);
 
-     # Make sure roman pages come before arabic ones, otherwise sort them in order
 
-     return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
 
-     return  1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);
 
-     return $apno <=> $bpno;
 
- }
 
- sub rangelen {
 
-     my $count = shift;
 
-     my @pages = @_;
 
-     my $len = 1;
 
-     my $inrange = 1;
 
-     my $current = &pageno($pages[$count]);
 
-     while ($count < $#pages && $inrange) {
 
- 	$count++;
 
- 	my $next = &pageno($pages[$count]);
 
- 	if ($current + 1 eq $next) {
 
- 	    $current = $next;
 
- 	    $inrange = 1;
 
- 	    $len++;
 
- 	} else {
 
- 	    $inrange = 0;
 
- 	}
 
-     }
 
-     return $len;
 
- }
 
 
  |