#!/usr/local/bin/perl -w
#
# ident2: much faster version of identify.  Uses XML::Parser module.
# However, it only works on XML documents, so the SGML version is 
# lost.  When we go entirely to XML, we'll use this version program.
use strict;
use XML::Parser;

my $file = shift;
die "Can't find file \"$file\"" unless -f $file;

my $level=0;
my ($oldlevel, @id, %ids_used, $sect);
my $par=0;      #paragraph counter
my $tag=0;      #tag counter

my $parser = new XML::Parser(ErrorContext => 2);
$parser->setHandlers(Char    => \&char_handler,
                     Default => \&default_handler,
                     Start   => \&start_handler,
                     End     => \&end_handler );
$parser->parsefile($file);


#############################
sub char_handler
{
  my ($p, $data) = @_;
  print encode($data);
}


sub default_handler
{
  my ($p, $data) = @_;
  print "$data" unless $data =~ m/^\s*$/;
}


sub start_handler
{
  my ($p, $el) = (shift, shift);
  my ($att, $val, %atts);

  print "<$el";		#print out element and current attributes
  while (@_) {
    my $att = shift;
    my $val = shift;
    $atts{$att}=$val;

    if ($att eq "id") { #see if ID has already been used and remember it
      warn "File contains duplicate ID -- $val\n" if $ids_used{$val};
      $ids_used{$val} = 1;
      next;             #process ids below
    }
    print "\n$att=\"$val\""; 
  }

# print "\n***Processing element $el\n";
  if ($el =~ m/^div(.)/) {  #if it's a division, compute new ID
    $oldlevel = $level;
    $level = $1;
    $par=$tag=0;
    $sect = $atts{"id"} || getName($level, %atts); #if existing ID, use it
    print "\nid=\"$sect\"";                        #else compute
    }
  elsif ($sect) {           #for other elements after the first div...
#   print "***Processing non-division $el (sect $sect)\n";
    if ($el =~ /p/i) {
       $tag=0;
       print "\nid=\"" . uniqID("$sect.p$par") . "\"";
       $par++;
    } else {
       print "\nid=\"" . uniqID("$sect.p$par.t$tag") . "\"";
       $tag++;
    }
  }

  print "\n>";
}


# just print out end tags. but how do we get those cute XML-style
# end tags, e.g. <tag/> instead of this expanded form?
sub end_handler
{
  my ($p, $el) = @_;
  print "</$el\n>";
}


############################################################
# 
# this function takes the level and attribute list for
# a div element and returns the current section (e.g. 1.3.2) 
#
# side-effect: modify global $id[] array of current context
# 
sub getName
{
  my $level = shift;
  my %atts = @_;
  my $divtitle = $atts{"title"};
  my $type = $atts{"type"};
  my $n = $atts{"n"};
  my ($sect, $i);

  if ($n)                                 #each level id is n if it exists 
    { $id[$level] = $n; }
  else
    { $id[$level] = $divtitle; }          #if no n, use divtitle as id

  $id[$level] =~ s|&.*?;||g;              #delete entities
  $id[$level] =~ s|[^-_0-9a-zA-Z]||gs;    #delete chars not in list
  $id[$level] =~ s|(.{1,12}).*|$1|;       #limit to 12 characters
 
  $sect = "";
  for ($i="1"; $i le $level; $i++)        #add id for each level
    { $sect .= ".".$id[$i]; }
  $sect =~ s|^\.||;
  $sect = "_$sect";

  return uniqID($sect);
}


# make sure ID used is unique -- if not, hack it.
sub uniqID
{
  my $id=shift;
  if ($ids_used{$id}) {
#   warn "Hey, id $id was already used!\n";
    $id .= "_1";
    $id++ while $ids_used{$id};
#   warn "Let's use $id instead\n";
    }
  $ids_used{$id}=1;
  return $id;
}


#encode <, >, and &
sub encode
{
  my $d = shift;
  $d =~ s/</&lt;/g;
  $d =~ s/>/&gt;/g;
  $d =~ s/&/&amp;/g;
  return $d;
}
