#!/usr/local/bin/perl -w
#
# x/t v0.5, Harry Plantinga, Nov. 8, 1998. This program may be 
# distributed and used under the terms of the Artistic License.
# 
# v0.51 -- modifications to work with version 0.5 of rtf2xml. Most
# noticeable change is that transdoc tags are lower case.
# v0.52 -- added changes from Skip Gaeda to pass text color through
# to XML doc
# v0.53 -- modified to generate lower case HTML tags for Voyager
#
# trn2thm: convert xml (transdoc dtd) to ThML (dtd of xml). This program
# generates legal ThML using CSS style="..." attributes for formatting, 
# but it still needs further processing:  
#   convert to XML
#   change style="" styles to class="" stylesheets
#   increment n= and href= in pb and div
#   adding IDs to elements
#
# Font sizes are expressed in percentages relative to 10pt, so that to
# change all font sizes in proportion it should be sufficient to change 
# the body font size.
#
# to do:
#   map known Greek, Heb fonts to unicode (SIL Ezra, SIL Galatia, etc.)
#
# ideas: use colors, bookmarks, page header and footer, hidden attribute?
#
use strict;
use ThMLutil;

my $input;

while (<>) 
  { $input .= $_; }

my $added;

# Footnotes are embedded inside paragraphs and hence have different
# nesting properties compared to the rest of an RTF file.  So we will
# process footnotes first, then escape them. Process the rest of the 
# file, then unescape.
#print STDERR "Footnotes: ";
$input =~ s|(<FOOTNOTE>.*?</FOOTNOTE>)|&escape(&x2t($1))|gsie;

#sg added -- unwind <string> <string> </string> </string> to 
#to <s> </s><s> </s><s> </s>, where the 3rd <s> is a copy of first
#$input =~ s|(<string[^>]*>)(.*?)(<string[^>]*>)(.*?)(</string>)|&unwind($1,$2,$3,$4,$5)|gse;

# print STDERR "\nRest: ";
$input =  &x2t($input);			#convert tags to thml
$input =~ s|&less-than;|<|gs;
$input =  &cleanup($input);

print $input;
exit(0);

  
sub x2t
{
  $_ = shift;

  # change FIELD HYPERLINK elements into <a href= style links
  s|<FIELD><FLDINST> HYPERLINK "?(.*?)"? </FLDINST><FLDRSLT><STRING.*?>(.*?)</STRING></FLDRSLT></FIELD>|<a href="$1">$2</a>|gi;

# print STDERR "Retagging...";
# convert special rtf2xml tags
  retag('SECTION',	'_detag');
  retag('TRANSDOC',	'_detag');
  retag('FILE',	'_detag');
  retag('PNTEXT',	'_detag');
  retag('ROW',		'tr', 'tr');
  retag('TR',	'_delete_attributes');
  retag('CELL',	'td', 'td');
  retag('TD',	'_delete_attributes');
  retag('FOOTNOTE',	'note place="foot"', 'note');
  retag('!DOCTYPE',	'_detag');
  retag('META',	'_delete');
  retag('table',	'table', 'table');
  retag('p',	'p', 'p');
  retag('bkmkend',	'_detag');	# delete bookmarks
  retag('bkmkstart',	'_detag');
  retag('page-break',	'_detag');	# delete page breaks
  retag('header',	'_delete');	# delete page header
  retag('footer',	'_delete');	# delete page footer

  #delete the initial entity declaration
  s|^.*ENTITY.*]>||si;
  #handle unicode -- convert to decimal escapes 
  s|<UNICHAR VALUE="x(.*?)".*?>|escapehex($1)|gie;
  #convert greek to decimal unicode escapes
  s|<STRING[^>]*CHARSET="161">(.*?)</STRING>|unigreek($1)|gsie;
  s|&tab;| |g;			#change tab entities to spaces
  s|<STRING[^>]*>&#183; </STRING>|&#8226; |gsi;	#bullet
  s|<TABLE>|<table border="2">|gi;

# print STDERR "styles...";
  s|(<p[^>]+>)|cssstyles($1)|gsie;	#change paragraph style atts to CSS
  s|(<STRING[^>]+>)|cssstyles($1)|gsie;	#change character style atts to CSS

# change named character style (first parameter) to tags 
  chartag('Citation',	'cite', 'cite');
  chartag('Code',	'code', 'code');
  chartag('Comment',	'_comment');
  chartag('Default',	'_detag');	#is this a bug? need a span here?
  chartag('XML',	'_unescape_detag');
  chartag('Name',	'name',	'name');
  chartag('Unclear',	'unclear', 'unclear');

  # change remaining STRINGs to span elements 
  #***bug here on nested strings.  need to handle recursively or parse through
  s|<STRING([^>]*?)CHARSTYNAME="(.*?)"(.*?)>(.*?)</STRING>|<span class="$2" $1 $3>$4</span>|gsi; 				      #named
  s|<STRING(.*?)>(.*?)</STRING>|<span$1>$2</span>|gsi; 	#unnamed

# change named paragraph styles (first parameter) to tags
  #do footnotes first since they're nested
  partag('footnote text','p class="Footnote"', 'p');
  partag('Attribution', 'attr', 'attr');
  partag('BlockQuote',	'blockquote><p', 'p></blockquote');
  partag('P_Continue',	'p class="Continue"', 'p');
  partag('P_First',	'p class="First"', 'p');
  partag('P_Resume',	'p class="Resume"', 'p');
  partag('HeaderInfo',	'_unescape_detag');
  partag('header',	'_delete');
  partag('heading 1',	'h1', 'h1');
  partag('heading 2',	'h2', 'h2');
  partag('heading 3',	'h3', 'h3');
  partag('heading 4',	'h4', 'h4');
  partag('heading 5',	'h5', 'h5');
  partag('heading 6',	'h6', 'h6');
  partag('hr',	'hr', 'hr');
  partag('hr30',	   'hr class="W30"', 'hr');
  partag('List',	   'p class="list1"', 'p');
  partag('List 2',	   'p class="list2"', 'p');
  partag('List 3',	   'p class="list3"', 'p');
  partag('List 4',	   'p class="list4"', 'p');
  partag('List Bullet',	   'p class="list1"', 'p');
  partag('List Bullet 2',  'p class="list2"', 'p');
  partag('List Bullet 3',  'p class="list3"', 'p');
  partag('List Bullet 4',  'p class="list4"', 'p');
  partag('List Number',	   'p class="list1"', 'p');
  partag('List Number 2',  'p class="list2"', 'p');
  partag('List Number 3',  'p class="list3"', 'p');
  partag('List Number 4',  'p class="list4"', 'p');
  partag('List Continue',  'p class="listCont1"', 'p');
  partag('List Continue 2','p class="listCont2"', 'p');
  partag('List Continue 3','p class="listCont3"', 'p');
  partag('List Continue 4','p class="listCont4"', 'p');
  partag('Term',	'term', 'term');
  partag('Definition',	'def', 'def');
  partag('SectionInfo',	'p class="sectionInfo"','p');
  partag('Verse',	'l class="1"', 'l');
  partag('Verse 1',	'l class="1"', 'l');
  partag('Verse 2',	'l class="2"', 'l');
  partag('Verse 3',	'l class="3"', 'l');
  partag('Verse 4',	'l class="4"', 'l');
  partag('Verse 5',	'l class="5"', 'l');
  partag('Verse Center','l class="c"', 'l');
  partag('Verse Right',	'l class="r"', 'l');

  # change remaining paragraph styles named xxx to P class="xxx" 
  s|STYLENAME=|class=|gi;
  # clean up HRs -- delete styles
  s|<HR( class=".*?").*?>|<hr$1>|g;

# print STDERR "cleanup...";

  # change empty <l></l> elements to <verse> (should be smarter)
  # this requires that every verse be preceeded by <l></l>.
# s|<l\b([^>]*)></l>|<verse$1>|gs; #add <verse> around <l> tags
  # now add </verse>
# s|(<verse[^>]*(\s*<l[^>]*>.*?</l>\s*))|$1</verse>\n|gsi;

  s|(<[^>]*?)\s\s+([^>]*?>)|$1 $2|gs;		#collapse whitespace in tags
  s|="\s*([^"]*)\s*"|="$1"|g;			#rm initial, final att spaces
  s|<span\s*>(.*?)</span>|$1|gsi;		#delete useless spans
  s|style=" |style="|g;				#delete initial style spaces

  #eventually should handle other charsets ...
  s|CHARSET=".*?"||gi;

  return $_;
}


sub cleanup 
{
  $_ = shift;

  s|(.*?<ThML.body>)||s;			#clean head section
  my $head = $1;
  $head =~ s|</?span.*?>||gsi;			#delete spans in head
  $head =~ s|<!--.*?-->||gsi;			#delete comments in head
  $head = "<?xml version=\"1.0\"?>".$head;      #add XML tag
  $_ = $head . $_;

  # delete <P> or <H?> that only contain whitespace & e.g. div,deleted 
  s@<(p|h[1-6])\b[^>]*>((\s*(<(!--|/?div|/?deleted|/?added|/?glossary|pb|scripContext|index|insertIndex|/?verse)[^>]*?>)\s*)+)</\1>@$2@gsi; 

  # delete anything between <ThML.body> and <div
  s|(<ThML.body>).*?(<div)|$1$2|s;

  s#<(p|h[1-6])([^>]*?>)(</\1>)#<$1$2&nbsp;$3#gs;#add &nbsp; in blank P or H?
  s|\n{3,}|\n\n|g;			#compress blank lines
  s|\&line;|<br/>|g;			#what to do with line entity?
  s|\&\#133;|...|g;			#fix up some ms-specific chars
  s|\&\#147;|\&ldquo;|g;
  s|\&\#148;|\&rdquo;|g;
  s|\&\#156;|\&oelig;|g;
  s|\&\#167;|\&sect;|g;
# s|(<[^>]*?)\&ldquo;|$1"|gs; 		#dumb-down quotes in tags
  s|=\&[rl]dquo;(.*?)\&[rl]dquo;|="$1"|gs;

  $_ .= "</ThML.body>\n</ThML>\n" unless m|</ThML.body>|;

  #clean up after a word 2000 beta 1 bug
  s|(</tr>)\s*</tr>|$1|gs;
  return $_;
}


#---------------------subroutines-----------------
sub cssstyles
{
  my $tag = shift;
  my $styles = "";
  my $newsize = "";
  my $oldsize = "10";

# print STDERR "In parstyles: processing $tag\n";

  $styles .= "font-family: '$1'; " if $tag =~ s|FONTNAME="(.*?)"||i; 
  $oldsize = $1 if $tag =~ m|FONTSIZE="(.*?)"|i;
  $newsize = ($oldsize*50)/10;		#% of 10pt, eg. 12pt --> 120%
  $newsize =~ s/\..*//;
  $styles .= "font-size: ".$newsize . "%; " if $tag =~ s|FONTSIZE="(.*?)"||i; 
  $styles .= "font-weight: bold; "          if $tag =~ s|BOLD="ON"||i; 
  $styles .= "font-weight: normal; "        if $tag =~ s|BOLD="OFF"||i; 
  $styles .= "font-style: italic; "         if $tag =~ s|ITALIC="ON"||i; 
  $styles .= "font-style: normal; "         if $tag =~ s|ITALIC="OFF"||i; 
  $styles .= "text-align: left; "           if $tag =~ s|ALIGN="LEFT"||i; 
  $styles .= "text-align: center; "         if $tag =~ s|ALIGN="CENTER"||i; 
  $styles .= "text-align: right; "          if $tag =~ s|ALIGN="RIGHT"||i; 
  $styles .= "text-align: justify; "        if $tag =~ s|ALIGN="JUSTIFY"||i; 
  $styles .= "text-decoration: underline; " if $tag =~ s|UNDERLINE="ON"||i; 
  $styles .= "text-decoration: none; "      if $tag =~ s|UNDERLINE="OFF"||i; 
  $styles .= "vertical-align: sub; "        if $tag =~ s|SUBSCRIPT="ON"||i; 
  $styles .= "vertical-align: normal; "     if $tag =~ s|SUBSCRIPT="OFF"||i; 
  $styles .= "vertical-align: super; "      if $tag =~ s|SUPERSCRIPT="ON"||i; 
  $styles .= "vertical-align: normal; "     if $tag =~s|SUPERSCRIPT="OFF"||i; 

  $tag =~ s|hidden=".*?"||;	#ignore hidden attribute
# $tag =~ s|color=".*?"||;	#ignore color attribute
  $styles .= "color: black; "       if $tag =~s|COLOR="1"||i; 
  $styles .= "color: blue; "        if $tag =~s|COLOR="2"||i; 
  $styles .= "color: turquoise; "   if $tag =~s|COLOR="3"||i; 
  $styles .= "color: lime; "        if $tag =~s|COLOR="4"||i; 
  $styles .= "color: magenta; "     if $tag =~s|COLOR="5"||i; 
  $styles .= "color: red; "         if $tag =~s|COLOR="6"||i; 
  $styles .= "color: yellow; "      if $tag =~s|COLOR="7"||i; 
  $styles .= "color: white; "       if $tag =~s|COLOR="8"||i; 
  $styles .= "color: darkblue; "    if $tag =~s|COLOR="9"||i; 
  $styles .= "color: teal; "        if $tag =~s|COLOR="10"||i; 
  $styles .= "color: darkgreen; "   if $tag =~s|COLOR="11"||i; 
  $styles .= "color: darkviolet; "  if $tag =~s|COLOR="12"||i; 
  $styles .= "color: darkred; "     if $tag =~s|COLOR="13"||i; 
  $styles .= "color: darkyellow; "  if $tag =~s|COLOR="14"||i; 
  $styles .= "color: gray; "        if $tag =~s|COLOR="15"||i; 
  $styles .= "color: lightgray; "   if $tag =~s|COLOR="16"||i; 

  $tag =~ s|>| style="$styles">| if $styles;
  return $tag;
}


# fix up any nested strings by splitting/unwinding.
sub unwind
{
  my @strs = @_;
  my $str = "";
  my $flag= 0;
  my $out;

  $flag = 1 if $strs[1] !~ m|</string>|;
  $flag+= 2 if $strs[3] =~ m|<string|;
  return join("",@strs) unless $flag;
  die "unwind: flag=3\n" if $flag > 2;

  warn "Found nested string: unwound.  Check output\n";
  print "*******unwind*******\n";
  print "** warning: this unwind code untested.  check results! ** ";
  print " [0] $strs[0]\n";
  print " [1] $strs[1]\n";
  print " [2] $strs[2]\n";
  print " [3] $strs[3]\n";
  print " [4] $strs[4]\n";
  
  #flag=1: <s1>  <s2>  </s2> -- add </s1> and <s1> at end
  $out = $strs[0].$strs[1];
  if ($flag & 1) {
    print "case 1\n";
    $out .= "</string>";
    $str = $strs[0];
  }
  $out .= $strs[2];

  #flag=2: <s1> </s1><s2> <s3></s3> -- add </s2> and <s2> at end
  if ($flag & 2) {
    print "case 2\n";
    $strs[3] =~ s|<string|</string><string|;
    $str = $strs[2];
  }

  $out .= $strs[3].$strs[4].$str;
  print "returning $out\n";
  return $out;
}

  
