# ThMLutil -- utilities for ThML file processing # use strict; # #given a div start tag, this subroutine returns the id, title, #and filename for the division. Dies if the division had no ID. # #Title chosen is the type and n if both are present (e.g. Chapter 3), #followed by the title attribute if present. If neither of those were #present, the ID is used. # sub getName { my $div = shift; my $title = ""; # print STDERR "Getting title from division $div\n"; die "A division had no ID: $div\n" unless $div =~ m|id="([^"]*)"|; my $id = $1; my ($n) = $div =~ m|n="([^"]*?)"|; my ($type) = $div =~ m|type="([^"]*?)"|s; $title = "$type $n. " if $type and $n; my ($titleatt) = $div =~ m|title="([^"]*?)"|s; if ($titleatt) { $title .= $titleatt if $title; $title = $titleatt unless $title; } $title ||= $id; my $filename = "$id.htm"; # print STDERR " returning title $title\n"; return ($id, $title, $filename); } # # delete all tags in parameter # sub detag { my $in = shift @_; $in =~ s|<[^>]*>||gs; return $in; } # change the tag for a character style to something else: # _delete --> delete start tag, end tag, and text between # _unescape_delete --> unescape and delete tags # _detag --> remove tags sub chartag { my $styname = shift; my $tagname = shift; my $tagend = shift; if ($tagname eq "_unescape_detag") { s|(.*?)|&unescape($1)|gsei; } elsif ($tagname eq "_delete") { s|.*?||gsi; } elsif ($tagname eq "_detag") { s|(.*?)|$1|gsi; } elsif ($tagname eq "_comment") { s|(.*?)||gsi; } else { s|(.*?)|<$tagname$1>$2|gsi; } } # handle a paragraph style: # _unescape_detag: unescape and remove tags # _detag: remove tags # _delete: delete tags and text # otherwise, switch to a new tag name sub partag { my $styname = shift; my $tagname = shift; my $tagend = shift; if ($tagname eq "_unescape_detag") { s|

(.*?)

|&unescape($1)|gsie; } elsif ($tagname eq "_delete") { s|

.*?

||gsi; } elsif ($tagname eq "_detag") { s|

(.*?)

|$1|gsi; } elsif ($tagend eq "") { s|

(.*?)

|<$tagname$1>$2|gsi; } else { s|

(.*?)

|<$tagname$1>$2|gsi; } } # change one tag to another sub retag { my $tagname = shift; my $tagrep = shift; my $tagend = shift; if ($tagrep eq "_detag") { s|||gsi; } elsif ($tagrep eq "_delete") { s|<$tagname\b.*?>.*?\s*||gsi; s|<$tagname\b.*?>\s*||gsi; } elsif ($tagrep eq "_delete_attributes") { s|(<$tagname\b).*?(/?>)|$1$2|gi; } else { s|<$tagname\b(.*?)>|<$tagrep$1>|gsi; s|||gsi; } } #escape notes so they don't get processed later sub escape { my $stuff = shift; $stuff =~ s/, & to & sub unescape { my $stuff = shift; $stuff =~ s/(\<)(.*?)(\>)/$1.&dumbquo($2).$3/gse; $stuff =~ s/\<//g; $stuff =~ s/\&/\&/g; return $stuff; } # #unsmarten (dumbify?) quotes and apostrophes because #netscape is unsmart enough not to understand them. # sub dumbquo { my $stuff = shift; $stuff =~ s/\&[rl]dquo;/"/g; $stuff =~ s/\&[rl]squo;/'/g; $stuff =~ s/\'/'/g; $stuff =~ s/\—/--/g; $stuff =~ s/\–/-/g; $stuff =~ s/\&\#8209;/-/g; $stuff =~ s/\&\#8212;/--/g; $stuff =~ s/\&\#8211;/-/g; $stuff =~ s/\&\#821[67];/'/g; $stuff =~ s/\&\#822[01];/"/g; return $stuff; } # convert a hex unicode code to a decimal escape sub escapehex { my $code = shift; my $dec = hex $code; return "&#$dec;"; } #convert greek (charset=161) to decimal unicode escapes sub unigreek { my $gk=shift; # print STDERR "Greek passage: $gk\n"; $gk =~ s|(\d+)|$1+720|ge; # print STDERR "...converted to $gk\n"; return $gk; } #---------------------------------------------------------- # # Identify subroutines -- add ids to many tags. # This code can be run more than once on a file without # ill effect -- and it has been greatly speeded up, so # it can be called once in index and once in thm2htm. # # ncrease code is also here, to add n= attributes to # tags, making each one more than the previous. # # bugs: if a has n attributes that can't be incremented, e.g. # n="Two", results are undesirable. n attributes should be arabic or # roman numerals. # my %ids_used; #----------------identify: add IDs--------------------------- # sub identify { my $in = shift; print "In identify\n"; # change n attribute of PB tags to Page_n id $in =~ s|(]*n=")([^"]*)("[^>]*?)\s*(/?>)|$1$2$3 id="Page_$2"$4|gs; # change all existing n="xx" to upper case, so we can add lower case roman # numerals later on. $in =~ s|(]*n=")([^"]*)(")|$1 . uc($2) . $3|gsie; # find all ids already in use while ($in =~ m|id="([^"]*)"|g) { $ids_used{$1} = 1; } # find head, body, tail $in =~ m|^(.*)(.*)(.*)$|is; my ($head, $div0, $tail) = ($1, $2, $3); return $head . &identifyDiv($div0,"0","") . $tail; } #---------------process a division -- add ids --------------- # # identifyDiv($div, $level, $prefix). A divn consists of some cdata # followed by 0 or more div(n+1)s. # --> make sure all divs have n attributes as needed # --> keep track of where we are (3.1.5, etc.). This is $prefix # --> process embedded divs recursively # This function does not receive only the contents of a div$level,not # the surrounding tags. # sub identifyDiv { my ($div, $level, $pre) = @_; my $hasdivs = $div =~ m|]*>)(.*?)()||si) { my ($stuff, $start, $newlevel, $content, $end) = ($1, $2, $3, $4, $5); $result .= $stuff; my $ntag = $start =~ m|n="([^"]*)"|; $oldn = $n; $n = $1 if $ntag; $pre2 = "$pre.$n" if $pre; $pre2 = $n unless $pre; # $start =~ s|\s*>| n="$n">|s if !$ntag and $ncrease; $n = &inc($n); $oldn = &inc($oldn); $n = $oldn unless $n; $start =~ s|\s*id="([^"]*)"||; $start =~ s|\s*>| id="$pre2">|s; $content = &identifyDiv($content, $newlevel, $pre2); $result .= "$start$content$end"; } return $result; } # # increment a number whether arabic or roman # sub inc { my $n = shift; if ($n =~ m|^[mdclxvi]*$|i) { $n = &incroman($n); } elsif ($n =~ m|^[0-9]+$|) { $n++; } else { $n = ""; } return $n; } # # increment lower- or upper-case roman numerals. Add one and carry. # Doesn't know what to do with MMMM, MMMMM, etc. # sub incroman { my $r = shift; my $one = "i"; $one = "I" if $r =~ m/^[MDCLXVI]*$/; $r .= $one;; # now perform carries $r =~ s|iiii|iv|; $r =~ s|IIII|IV|; $r =~ s|ivi|v|; $r =~ s|IVI|V|; $r =~ s|viv|ix|; $r =~ s|VIV|IX|; $r =~ s|ixi|x|; $r =~ s|IXI|X|; $r =~ s|xxxx|xl|; $r =~ s|XXXX|XL|; $r =~ s|xlx|l|; $r =~ s|XLX|L|; $r =~ s|lxl|xc|; $r =~ s|LXL|XC|; $r =~ s|xcx|c|; $r =~ s|XCX|C|; $r =~ s|cccc|cd|; $r =~ s|CCCC|CD|; $r =~ s|cdc|d|; $r =~ s|CDC|D|; $r =~ s|dcd|cm|; $r =~ s|DCD|CM|; $r =~ s|cmc|m|; $r =~ s|CMC|M|; return $r; } #------------ add tag ids -------------- # # This subroutine adds an id attribute to each element after the # first . The id is of the form xxx.xxx.xxx.pyy.zz, where # each xxx identifies a div, yy is the paragraph number in the # division, and zz is the element number. # sub addid { my ($in,$pre) = @_; print STDERR "addid input length:" . length($in); my ($p,$t,$out,$tag,$id) = ("0","0","","",""); while ($in =~ s|^([^<]*)(<[^>]*>)||s) { $out .= $1; $tag = $2; if (($tag =~ m/^(|| $t++; #increment tag counter $id = "$pre.p$p.$t"; if ($ids_used{$id}) { print STDERR "HEY -- $id was already used!\n" if $ids_used{$id}; my $idsuffix = "_1"; while ($ids_used{$id . $idsuffix}) { $idsuffix++; } $id .= $idsuffix; print STDERR " -- let's use $id instead.\n"; } } $tag =~ s|\s*(/?>)| id="$id"$1|si; $out .= $tag; $ids_used{$id} = 1; } # print STDERR "-->" . length($out) . "\n"; return $out; } #---------------------------------------------------------------- # ncrease tags sub ncrease { $_ = shift; #first determine the number of pages per image, if there are images. my $col = 0; my $output = ""; my ($pb, $oldhref, $foundhref, $newn, $oldn); my $href = ""; # now loop through $_, stopping to process each element. while (m|^(.*?)()|s) { s|^(.*?)()||s; $output .= $1; my $pb = $2; # print STDERR "$pb --> "; # #this section handles hrefs. #If there is not an href, but there was one previously, inc previous one. #If there is an href, remember it, and if it is the same as previous one, # set $col to 2. (2 pages per image; this is second column) # $oldhref = $href; $foundhref = ""; $foundhref = $1 if $pb =~ m|href="(.*?)"|; if (!$foundhref && $oldhref) { # print STDERR "noref oldref"; ($href, $col) = incpage($oldhref, $col); $pb =~ s|/>| href="$href" />|; } elsif (length($foundhref)>0) { # print STDERR "href"; $href=$foundhref; $col=2 if $href eq $oldhref; } # print STDERR " col=$col "; #this section increments n, the page number my $n = ""; $n = $1 if $pb =~ m|n="(.*?)"|; if ($n) { $newn = $n; } else { $newn = &inc($oldn); } $oldn = $newn; $pb =~ s|/>| n="$newn" />| unless $pb =~ m|n=|; #increment href, the page image # print STDERR "$pb\n"; $output .= $pb; } $output .= $_; return $output; } # smart-increment a picture href # # Two sequences are supported: xx001.gif, xx002.gif, etc and xx001a.gif, # xx001b.gif, xx002a.gif, xx002b.gif, etc. In addition, there may be one # or two pages per image. # # This subroutine takes a filename and $col=0, 1, or 2. If 0, there is # one page per image. If 1, there are two pages per image, and this is # the first. If two, there are two pages per image, and this is the # second. # # The return values are the new $filename and the new $col value. # sub incpage { my $filename = shift; my $col = shift; return ($filename, 2) if $col==1; #if there's another page on this pic, use it. $col=1 if $col==2; if ($filename =~ m|a\.[^\.]+$|) { $filename =~ s|a(\.[^\.]+$)|b$1|; } else { $filename =~ s|b(\.[^\.]+$)|a$1|; $filename =~ m|([0-9]+)[^0-9]*$|; my $oldn = $1; my $newn = $1 + 1; while ( length($newn) < length($oldn) ) {$newn = "0" . $newn; } $filename =~ s|$oldn|$newn|; } return ($filename, $col); } sub divcrease { my $in = shift; my ($out, $oldn, $n, $div, $level, $oldlevel) = ("","","","","",""); while ($in =~ m|^(.*?)()|s) { $in =~ s|^(.*?)(]*>)||s; $out .= $1; $div = $2; $oldn = $n; $n = ""; $n = $1 if $div =~ m|n="(.*?)"|; $oldlevel = $level; $level = ""; $level = $1 if $div =~ m||$1 n="$n" >|s if $n; # print "ncrease: div=$div oldn=$oldn n=$n ndiv=$div\n"; } $out .= $div; } $out .= $in; return $out; } 1;