# ThMLutil -- utilities for ThML file processing
#
use strict;


#
#given a div start tag, this subroutine returns the id, title, 
#and filename for the division.  Dies if the division had no ID.
#
#Title chosen is the type and n if both are present (e.g. Chapter 3),
#followed by the title attribute if present.  If neither of those were
#present, the ID is used.
#
sub getName
{
  my $div = shift;
  my $title = "";
# print STDERR "Getting title from division $div\n";
  die "A division had no ID: $div\n" unless $div =~ m|id="([^"]*)"|;
  my $id = $1;
  my ($n) = $div =~ m|n="([^"]*?)"|;
  my ($type) = $div =~ m|type="([^"]*?)"|s;
  $title = "$type $n. " if $type and $n; 
  my ($titleatt) = $div =~ m|title="([^"]*?)"|s;
  if ($titleatt) {
    $title .= $titleatt if $title;
    $title = $titleatt unless $title;
  }
  $title ||= $id;
  my $filename = "$id.htm";
# print STDERR "  returning title $title\n";
  return ($id, $title, $filename);
}
 

#
# delete all tags in parameter
#
sub detag
{
  my $in = shift @_;
  $in =~ s|<[^>]*>||gs;
  return $in;
}


# change the tag for a character style to something else:
# _delete --> delete start tag, end tag, and text between
# _unescape_delete --> unescape and delete tags
# _detag --> remove tags

sub chartag
{
  my $styname = shift;
  my $tagname = shift;
  my $tagend  = shift;

  if ($tagname eq "_unescape_detag") {
    s|<STRING CHARSTYNAME="$styname".*?>(.*?)</STRING>|&unescape($1)|gsei; }
  elsif ($tagname eq "_delete") {
    s|<STRING CHARSTYNAME="$styname".*?>.*?</STRING>||gsi; }
  elsif ($tagname eq "_detag") {
    s|<STRING CHARSTYNAME="$styname".*?>(.*?)</STRING>|$1|gsi; }
  elsif ($tagname eq "_comment") {
    s|<STRING CHARSTYNAME="$styname".*?>(.*?)</STRING>|<!-- $1 -->|gsi; }
  else {
    s|<STRING CHARSTYNAME="$styname"(.*?)>(.*?)</STRING>|<$tagname$1>$2</$tagend>|gsi; }
}


# handle a paragraph style:
# _unescape_detag: unescape and remove tags
# _detag: remove tags
# _delete: delete tags and text
# otherwise, switch to a new tag name

sub partag
{
  my $styname = shift;
  my $tagname = shift;
  my $tagend  = shift;

  if ($tagname eq "_unescape_detag") {
    s|<P STYLENAME="$styname".*?>(.*?)</P>|&unescape($1)|gsie; }
  elsif ($tagname eq "_delete") {
    s|<P STYLENAME="$styname".*?>.*?</P>||gsi; }
  elsif ($tagname eq "_detag") {
    s|<P STYLENAME="$styname".*?>(.*?)</P>|$1|gsi; }
  elsif ($tagend eq "") {
    s|<P STYLENAME="$styname"(.*?)>(.*?)</P>|<$tagname$1>$2|gsi; }
  else {
    s|<P STYLENAME="$styname"(.*?)>(.*?)</P>|<$tagname$1>$2</$tagend>|gsi; }
}


# change one tag to another
sub retag
{
  my $tagname = shift;
  my $tagrep  = shift;
  my $tagend  = shift;

  if ($tagrep eq "_detag")
    { s|</?$tagname\b.*?>||gsi; }
  elsif ($tagrep eq "_delete")
  {
    s|<$tagname\b.*?>.*?</$tagname>\s*||gsi;
    s|<$tagname\b.*?>\s*||gsi;
  }
  elsif ($tagrep eq "_delete_attributes")
  {
    s|(<$tagname\b).*?(/?>)|$1$2|gi;
  }
  else
  {
    s|<$tagname\b(.*?)>|<$tagrep$1>|gsi;
    s|</$tagname\b(.*?)>|</$tagend$1>|gsi;
  }
}


#escape notes so they don't get processed later
sub escape 
{
  my $stuff = shift;
  $stuff =~ s/</\&less-than;/g;
  return $stuff;
}


#change &lt; to <, &gt; to >, &amp; to &
sub unescape
{
  my $stuff = shift;
  $stuff =~ s/(\&lt;)(.*?)(\&gt;)/$1.&dumbquo($2).$3/gse;
  $stuff =~ s/\&lt;/</g;
  $stuff =~ s/\&gt;/>/g;
  $stuff =~ s/\&amp;/\&/g;
  return $stuff;
}


#
#unsmarten (dumbify?) quotes and apostrophes because
#netscape is unsmart enough not to understand them.
#
sub dumbquo
{
  my $stuff = shift;
  $stuff =~ s/\&[rl]dquo;/"/g;
  $stuff =~ s/\&[rl]squo;/'/g;
  $stuff =~ s/\&apos;/'/g;
  $stuff =~ s/\&mdash;/--/g;
  $stuff =~ s/\&ndash;/-/g;
  $stuff =~ s/\&\#8209;/-/g;
  $stuff =~ s/\&\#8212;/--/g;
  $stuff =~ s/\&\#8211;/-/g;
  $stuff =~ s/\&\#821[67];/'/g;
  $stuff =~ s/\&\#822[01];/"/g;
  return $stuff;
}


# convert a hex unicode code to a decimal escape
sub escapehex
{
  my $code = shift;
  my $dec = hex $code;
  return "&#$dec;";
}


#convert greek (charset=161) to decimal unicode escapes
sub unigreek
{
  my $gk=shift;
# print STDERR "Greek passage: $gk\n";
  $gk =~ s|(\d+)|$1+720|ge;
# print STDERR "...converted to $gk\n";
  return $gk;
}


#----------------------------------------------------------
#
# Identify subroutines -- add ids to many tags.
# This code can be run more than once on a file without
# ill effect -- and it has been greatly speeded up, so
# it can be called once in index and once in thm2htm.
#
# ncrease code is also here, to add n= attributes to
# <divn> tags, making each one more than the previous.
#
# bugs: if a <divn> has n attributes that can't be incremented, e.g.
# n="Two", results are undesirable. n attributes should be arabic or
# roman numerals.
#

my %ids_used;

#----------------identify: add IDs---------------------------
#
sub identify 
{
my $in = shift;
print "In identify\n";

# change n attribute of PB tags to Page_n id
$in =~ s|(<pb[^>]*n=")([^"]*)("[^>]*?)\s*(/?>)|$1$2$3 id="Page_$2"$4|gs;

# change all existing n="xx" to upper case, so we can add lower case roman
# numerals later on.
$in =~ s|(<div[^>]*n=")([^"]*)(")|$1 . uc($2) . $3|gsie;

# find all ids already in use
while ($in =~ m|id="([^"]*)"|g) { $ids_used{$1} = 1; }

# find head, body, tail
$in =~ m|^(.*<ThML.body>)(.*)(</ThML.body>.*)$|is;
my ($head, $div0, $tail) = ($1, $2, $3);

return $head . &identifyDiv($div0,"0","") . $tail;
}


#---------------process a division -- add ids ---------------
#
# identifyDiv($div, $level, $prefix).  A divn consists of some cdata 
# followed by 0 or more div(n+1)s.
# --> make sure all divs have n attributes as needed
# --> keep track of where we are (3.1.5, etc.). This is $prefix
# --> process embedded divs recursively
# This function does not receive only the contents of a div$level,not
# the surrounding tags.
#
sub identifyDiv
{
  my ($div, $level, $pre) = @_;
  my $hasdivs = $div =~ m|<div\d|;
  print "In identifyDiv\n";
  return &addid($div,$pre) unless $hasdivs;

  $div =~ s|(^.*?)(<div\d)|$2|si;
  my $result = &addid($1,$pre);

  my ($n,$oldn,$id,$pre2) = ("i","","");
  while ($div =~ s|(.*?)(<div(\d)[^>]*>)(.*?)(</div\3>)||si) {
    my ($stuff, $start, $newlevel, $content, $end) = ($1, $2, $3, $4, $5);
    $result .= $stuff;
    my $ntag = $start =~ m|n="([^"]*)"|;
    $oldn = $n;
    $n = $1 if $ntag;
    $pre2 = "$pre.$n" if $pre;
    $pre2 = $n unless $pre;
#   $start =~ s|\s*>| n="$n">|s if !$ntag and $ncrease;
    $n = &inc($n);
    $oldn = &inc($oldn);
    $n = $oldn unless $n;

    $start =~ s|\s*id="([^"]*)"||;
    $start =~ s|\s*>| id="$pre2">|s;

    $content = &identifyDiv($content, $newlevel, $pre2);
    $result .= "$start$content$end";
  }
  return $result;
}

  
#
# increment a number whether arabic or roman
#
sub inc				
{
  my $n = shift;

  if ($n =~ m|^[mdclxvi]*$|i)
    { $n = &incroman($n); }
  elsif ($n =~ m|^[0-9]+$|)
    { $n++; }
  else
    { $n = ""; }

  return $n; 
} 


#
# increment lower- or upper-case roman numerals. Add one and carry.
# Doesn't know what to do with MMMM, MMMMM, etc.
#
sub incroman	
{
  my $r = shift;
  my $one = "i";
  $one = "I" if $r =~ m/^[MDCLXVI]*$/;
  $r .= $one;;

  # now perform carries
  $r =~ s|iiii|iv|;
  $r =~ s|IIII|IV|;
  $r =~ s|ivi|v|;
  $r =~ s|IVI|V|;
  $r =~ s|viv|ix|;
  $r =~ s|VIV|IX|;
  $r =~ s|ixi|x|;
  $r =~ s|IXI|X|;
  $r =~ s|xxxx|xl|;
  $r =~ s|XXXX|XL|;
  $r =~ s|xlx|l|;
  $r =~ s|XLX|L|;
  $r =~ s|lxl|xc|;
  $r =~ s|LXL|XC|;
  $r =~ s|xcx|c|;
  $r =~ s|XCX|C|;
  $r =~ s|cccc|cd|;
  $r =~ s|CCCC|CD|;
  $r =~ s|cdc|d|;
  $r =~ s|CDC|D|;
  $r =~ s|dcd|cm|;
  $r =~ s|DCD|CM|;
  $r =~ s|cmc|m|;
  $r =~ s|CMC|M|;

  return $r;
}


#------------ add tag ids --------------
# 
# This subroutine adds an id attribute to each element after the
# first <div1>. The id is of the form xxx.xxx.xxx.pyy.zz, where
# each xxx identifies a div, yy is the paragraph number in the
# division, and zz is the element number.
#


sub addid
{
  my ($in,$pre) = @_;
  print STDERR "addid input length:" . length($in);
  my ($p,$t,$out,$tag,$id) = ("0","0","","","");

  while ($in =~ s|^([^<]*)(<[^>]*>)||s) {
    $out .= $1;
    $tag = $2;

    if (($tag =~ m/^(<i>|<b>|<br|<hr|<sup|<note|<span|<!)/) || #unwanted tags
        ($tag =~ m|^</|) ||		#end tag
        ($tag =~ m|id="|)) {		#tag already had an ID
        $out .= $tag;
        next;
    }

    if ($tag =~ m|(<p[^>]*)>|s)
    {
      $p++;				#increment paragraph counter
      $t = "0";				#reset tag counter to 0
      $id = "$pre.p$p";
      if ($ids_used{$id}) {
        print STDERR "HEY -- $id was already used! (2)\n" if $ids_used{$id};
        my $idsuffix = "_1";
        while ($ids_used{$id . $idsuffix}) { $idsuffix++; }
        $id .= $idsuffix;
        print STDERR "    -- let's use $id instead.\n";
      }
    } else { 				#any other tag, possibly <  />
      $t++;				#increment tag counter
      $id = "$pre.p$p.$t";
      if ($ids_used{$id}) {
        print STDERR "HEY -- $id was already used!\n" if $ids_used{$id};
        my $idsuffix = "_1";
        while ($ids_used{$id . $idsuffix}) { $idsuffix++; }
        $id .= $idsuffix;
        print STDERR "    -- let's use $id instead.\n";
      }
    }
    $tag =~ s|\s*(/?>)| id="$id"$1|si;
    $out .= $tag;
    $ids_used{$id} = 1;
  }

# print STDERR "-->" . length($out) . "\n";
  return $out;
}


#----------------------------------------------------------------
# ncrease <pb> tags

sub ncrease
{
$_ = shift;
#first determine the number of pages per image, if there are images.
my $col = 0;
my $output = "";
my ($pb, $oldhref, $foundhref, $newn, $oldn);
my $href = "";

# now loop through $_, stopping to process each <pb...> element.
while (m|^(.*?)(<pb.*?/>)|s)
  {
  s|^(.*?)(<pb.*?/>)||s;
  $output .= $1;
  my $pb = $2;
# print STDERR "$pb --> ";

  #
  #this section handles hrefs. 
  #If there is not an href, but there was one previously, inc previous one.
  #If there is an href, remember it, and if it is the same as previous one, 
  # set $col to 2. (2 pages per image; this is second column)
  #
  $oldhref = $href;		
  $foundhref = "";
  $foundhref = $1 if $pb =~ m|href="(.*?)"|;
  if (!$foundhref && $oldhref)
    { 
#   print STDERR "noref oldref";
    ($href, $col) = incpage($oldhref, $col); 
    $pb =~ s|/>| href="$href" />|;
    }
  elsif (length($foundhref)>0)
    {
#   print STDERR "href";
    $href=$foundhref;
    $col=2 if $href eq $oldhref;
    }
# print STDERR " col=$col ";

  #this section increments n, the page number
  my $n = "";
  $n = $1 if $pb =~ m|n="(.*?)"|;
  if ($n)
    { $newn = $n; }
  else
    { $newn = &inc($oldn); }
  $oldn = $newn;
  $pb =~ s|/>| n="$newn" />| unless $pb =~ m|n=|;

  #increment href, the page image

# print STDERR "$pb\n";
  $output .= $pb;
  }
  $output .= $_;
  return $output;
}


#  smart-increment a picture href
#
#  Two sequences are supported: xx001.gif, xx002.gif, etc and xx001a.gif, 
#  xx001b.gif, xx002a.gif, xx002b.gif, etc. In addition, there may be one
#  or two pages per image.
#
#  This subroutine takes a filename and $col=0, 1, or 2.  If 0, there is
#  one page per image.  If 1, there are two pages per image, and this is
#  the first.  If two, there are two pages per image, and this is the
#  second.
#
#  The return values are the new $filename and the new $col value.
#
sub incpage	
{
  my $filename = shift;
  my $col = shift;

  return ($filename, 2) if $col==1; 
				#if there's another page on this pic, use it.

  $col=1 if $col==2;
  if ($filename =~ m|a\.[^\.]+$|)
    { $filename =~ s|a(\.[^\.]+$)|b$1|; }
  else
    {
    $filename =~ s|b(\.[^\.]+$)|a$1|;

    $filename =~ m|([0-9]+)[^0-9]*$|;
    my $oldn = $1;
    my $newn = $1 + 1;
    while ( length($newn) < length($oldn) )
      {$newn = "0" . $newn; }

    $filename =~ s|$oldn|$newn|;
    }

  return ($filename, $col);
}


sub divcrease
{
  my $in = shift;
  my ($out, $oldn, $n, $div, $level, $oldlevel) = ("","","","","","");

  while ($in =~ m|^(.*?)(<div\d.*?>)|s)
    {
    $in =~ s|^(.*?)(<div\d[^>]*>)||s;
    $out .= $1;
    $div = $2;
    $oldn = $n;
    $n = "";
    $n = $1 if $div =~ m|n="(.*?)"|;
    $oldlevel = $level;
    $level = "";
    $level = $1 if $div =~ m|<div(.)|;

#   print " oldn=$oldn n=$n   oldlevel=$oldlevel level=$level\n";
    if ($level eq $oldlevel and $oldn and !$n)
    {
      $n = &inc($oldn);
      $div =~ s|(<div.*?)>|$1 n="$n" >|s if $n;
#     print "ncrease: div=$div oldn=$oldn n=$n ndiv=$div\n";
    }
    $out .= $div;
  }

  $out .= $in;
  return $out;
}


1;
