pdf2index 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. #!/usr/bin/perl -- # -*- Perl -*-
  2. # this needs some cleanup...
  3. my $PSTOTEXT = "pstotext";
  4. my $pdf = shift @ARGV;
  5. my $index = "";
  6. my $inindex = 0;
  7. open (F, "$PSTOTEXT $pdf |");
  8. while (<F>) {
  9. if (/^<\/index/) {
  10. $index .= $_;
  11. $inindex = 0;
  12. }
  13. $inindex = 1 if /^<index/;
  14. if ($inindex) {
  15. $index .= $_ if /^\s*</;
  16. }
  17. }
  18. my $cindex = "";
  19. while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {
  20. $cindex .= $1;
  21. $_ = $2;
  22. $index = $'; # '
  23. my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;
  24. # Expand ranges
  25. if ($#pages >= 0) {
  26. my @mpages = ();
  27. foreach my $page (@pages) {
  28. my $pageno = &pageno($page);
  29. if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
  30. for (my $count = $1; $count <= $2; $count++) {
  31. push (@mpages, "<phrase role=\"$pageno\">$count</phrase>");
  32. }
  33. } else {
  34. push (@mpages, $page);
  35. }
  36. }
  37. @pages = sort rangesort @mpages;
  38. }
  39. # Remove duplicates...
  40. if ($#pages > 0) {
  41. my @mpages = ();
  42. my $current = "";
  43. foreach my $page (@pages) {
  44. my $pageno = &pageno($page);
  45. if ($pageno ne $current) {
  46. push (@mpages, $page);
  47. $current = $pageno;
  48. }
  49. }
  50. @pages = @mpages;
  51. }
  52. # Collapse ranges...
  53. if ($#pages > 1) {
  54. my @cpages = ();
  55. while (@pages) {
  56. my $count = 0;
  57. my $len = &rangelen($count, @pages);
  58. if ($len <= 2) {
  59. my $page = shift @pages;
  60. push (@cpages, $page);
  61. } else {
  62. my $fpage = shift @pages;
  63. my $lpage = "";
  64. while ($len > 1) {
  65. $lpage = shift @pages;
  66. $len--;
  67. }
  68. my $fpno = &pageno($fpage);
  69. my $lpno = &pageno($lpage);
  70. $fpage =~ s/>$fpno</>${fpno}-$lpno</s;
  71. push (@cpages, $fpage);
  72. }
  73. }
  74. @pages = @cpages;
  75. }
  76. my $page = shift @pages;
  77. $page =~ s/\s*$//s;
  78. $cindex .= $page;
  79. while (@pages) {
  80. $page = shift @pages;
  81. $page =~ s/\s*$//s;
  82. $cindex .= ", $page";
  83. }
  84. }
  85. $cindex .= $index;
  86. print "$cindex\n";
  87. sub pageno {
  88. my $page = shift;
  89. $page =~ s/^<phrase.*?>//;
  90. $page =~ s/^<link.*?>//;
  91. return $1 if $page =~ /^([^<>]+)/;
  92. return "?";
  93. }
  94. sub rangesort {
  95. my $apno = &pageno($a);
  96. my $bpno = &pageno($b);
  97. # Make sure roman pages come before arabic ones, otherwise sort them in order
  98. return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
  99. return 1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);
  100. return $apno <=> $bpno;
  101. }
  102. sub rangelen {
  103. my $count = shift;
  104. my @pages = @_;
  105. my $len = 1;
  106. my $inrange = 1;
  107. my $current = &pageno($pages[$count]);
  108. while ($count < $#pages && $inrange) {
  109. $count++;
  110. my $next = &pageno($pages[$count]);
  111. if ($current + 1 eq $next) {
  112. $current = $next;
  113. $inrange = 1;
  114. $len++;
  115. } else {
  116. $inrange = 0;
  117. }
  118. }
  119. return $len;
  120. }