# ThMLutil -- utilities for ThML file processing
#
use strict;


#
#given a div start tag, this subroutine returns the id, title, 
#and filename for the division.  Dies if the division had no ID.
#
#Title chosen is the type and n if both are present (e.g. Chapter 3),
#followed by the title attribute if present.  If neither of those were
#present, the ID is used.
#
sub getName
{
  my $div = shift;
  my $title = "";
# print STDERR "Getting title from division $div\n";
  die "A division had no ID: $div\n" unless $div =~ m|id="(.*?)"|;
  my $id = $1;
  my ($n) = $div =~ m|n="(.*?)"|;
  my ($type) = $div =~ m|type="(.*?)"|;
  $title = "$type $n. " if $type and $n; 
  my ($titleatt) = $div =~ m|title="(.*?)"|;
  if ($titleatt) {
    $title .= $titleatt if $title;
    $title = $titleatt unless $title;
  }
  $title ||= $id;
  my $filename = "$id.htm";
# print STDERR "  returning title $title\n";
  return ($id, $title, $filename);
}
 

#
# delete all tags in parameter
#
sub detag
{
  my $in = shift @_;
  $in =~ s|<.*?>||gs;
  return $in;
}


# change the tag for a character style to something else:
# _delete --> delete start tag, end tag, and text between
# _unescape_delete --> unescape and delete tags
# _detag --> remove tags

sub chartag
{
  my $styname = shift;
  my $tagname = shift;
  my $tagend  = shift;

  if ($tagname eq "_unescape_detag") {
    s|<STRING CHARSTYNAME="$styname".*?>(.*?)</STRING>|&unescape($1)|gsei; }
  elsif ($tagname eq "_delete") {
    s|<STRING CHARSTYNAME="$styname".*?>.*?</STRING>||gsi; }
  elsif ($tagname eq "_detag") {
    s|<STRING CHARSTYNAME="$styname".*?>(.*?)</STRING>|$1|gsi; }
  elsif ($tagname eq "_comment") {
    s|<STRING CHARSTYNAME="$styname".*?>(.*?)</STRING>|<!-- $1 -->|gsi; }
  else {
    s|<STRING CHARSTYNAME="$styname"(.*?)>(.*?)</STRING>|<$tagname$1>$2</$tagend>|gsi; }
}


# handle a paragraph style:
# _unescape_detag: unescape and remove tags
# _detag: remove tags
# _delete: delete tags and text
# otherwise, switch to a new tag name

sub partag
{
  my $styname = shift;
  my $tagname = shift;
  my $tagend  = shift;

  if ($tagname eq "_unescape_detag") {
    s|<P STYLENAME="$styname".*?>(.*?)</P>|&unescape($1)|gsie; }
  elsif ($tagname eq "_delete") {
    s|<P STYLENAME="$styname".*?>.*?</P>||gsi; }
  elsif ($tagname eq "_detag") {
    s|<P STYLENAME="$styname".*?>(.*?)</P>|$1|gsi; }
  elsif ($tagend eq "") {
    s|<P STYLENAME="$styname"(.*?)>(.*?)</P>|<$tagname$1>$2|gsi; }
  else {
    s|<P STYLENAME="$styname"(.*?)>(.*?)</P>|<$tagname$1>$2</$tagend>|gsi; }
}


# change one tag to another
sub retag
{
  my $tagname = shift;
  my $tagrep  = shift;
  my $tagend  = shift;

  if ($tagrep eq "_detag")
    { s|</?$tagname\b.*?>||gsi; }
  elsif ($tagrep eq "_delete")
  {
    s|<$tagname\b.*?>.*?</$tagname>\s*||gsi;
    s|<$tagname\b.*?>\s*||gsi;
  }
  elsif ($tagrep eq "_delete_attributes")
  {
    s|(<$tagname\b).*?(/?>)|$1$2|gi;
  }
  else
  {
    s|<$tagname\b(.*?)>|<$tagrep$1>|gsi;
    s|</$tagname\b(.*?)>|</$tagend$1>|gsi;
  }
}


#escape notes so they don't get processed later
sub escape 
{
  my $stuff = shift;
  $stuff =~ s/</\&less-than;/g;
  return $stuff;
}


#change &lt; to <, &gt; to >, &amp; to &
sub unescape
{
  my $stuff = shift;
  $stuff =~ s/(\&lt;)(.*?)(\&gt;)/$1.&dumbquo($2).$3/gse;
  $stuff =~ s/\&lt;/</g;
  $stuff =~ s/\&gt;/>/g;
  $stuff =~ s/\&amp;/\&/g;
  $stuff =~ s/&mdash;/&#8212;/g;
  $stuff =~ s/&ndash;/&#8211;/g;
  return $stuff;
}


#unsmarten (dumbify?) quotes and apostrophes
sub dumbquo
{
  my $stuff = shift;
  $stuff =~ s/\&[rl]dquo;/"/g;
  $stuff =~ s/\&[rl]squo;/'/g;
  $stuff =~ s/\&apos;/'/g;
  return $stuff;
}


# convert a hex unicode code to a decimal escape
sub escapehex
{
  my $code = shift;
  my $dec = hex $code;
  return "&#$dec;";
}


#convert greek (charset=161) to decimal unicode escapes
sub unigreek
{
  my $gk=shift;
# print STDERR "Greek passage: $gk\n";
  $gk =~ s|(\d+)|$1+720|ge;
# print STDERR "...converted to $gk\n";
  return $gk;
}

1;

