#!/usr/bin/perl -w
#
# Script to convert the GAP manual to HTML
# usage convert.pl [-cs] <doc-directory> [<html-directory>]
#
# Caveats: 
#
#  1. This script assumes that the .toc file is up-to-date with the .tex files
#     and will almost certainly fail horribly if this is not true
#
#  2. The output files are CxxxSxxx.htm, (not .html) plus chapters.htm,
#     theindex.htm and biblio.htm. A (front page) 
#     file index.htm is assumed, but not created
#     Not all servers will serve .htm files as HTML without adjustments
#
#  3. The script assumes that the .tex files comply with GAP conventions, including
#     unwritten ones. It tries to follow the behaviour of the on-line browser
#
#  Options:
#
#    -c  file-per-chapter mode -- generates one HTML file CHAPxxx.htm for each chapter
#        sections are level 2 headings and anchors CHAPxxx.htm#SECTxxx. This is intended
#        for local browsing, especially under MS-DOS
# 
#    -s  silent running. Conversational messages are suppressed.
#
#    html-directory defaults to the current directory.
#

# Refuse to run on an interpreter older than Perl 5
#
die "Needs perl 5" unless $] > 5;

use Getopt::Std;


#
# Global variables 
#
#  $dir  -- the full pathname of the input directory, including a trailing /
#  $odir -- the full pathname of the output directory, including a trailing /
#  $opt_c and $opt_s set by getopts()
#  @chapters -- the chapters data structure
#  IN    -- the current input file (outputfiles are handled by select)
#  $footer -- the trailer put on every page
#  $indexcount -- used within chapters to number the index anchors
#


# getchaps:
#
# Scan the .toc and .tex files to get chapter names and numbers,
# section names and numbers, and the associated filenames. Loads up
# @chapters and %sections_by_name.
#

# Patterns (kept as plain strings, interpolated into matches later) that
# recognise a chapter line and a section line of a .toc file.
#   $chapexp captures: chapter number, chapter title
#   $secexp  captures: chapter number, section number, section title
$chapexp = q(\\\\numberline\s+\{(\d+)\}(.+)\}\s*\{\d+\});
$secexp  = q(\\\\numberline\s+\{(\d+)\.(\d+)\}(.+)\}\s*\{\d+\});

#
# canonize -- normalise a section name for use as a hash key:
# ASCII upper case is folded to lower case and all whitespace is removed.
#

sub canonize {
    (my $folded = $_[0]) =~ tr/A-Z/a-z/;
    $folded =~ s/\s//g;
    return $folded;
}

# getchaps -- build @chapters and %sections_by_name.
#
# Pass 1 reads ${dir}manual.toc.  A line matching $chapexp creates a chapter
# record {name, number, sections} stored at $chapters[number]; the chapter is
# also entered as its own section 0.  A line matching $secexp creates a
# section record {name, secnum, chapnum, chapter} stored in the owning
# chapter's {sections} list.  Every record is indexed in %sections_by_name
# under its canonized name.  Any other line is reported on STDERR.
#
# Pass 2 reads ${dir}manual.tex and assigns the n-th uncommented
# \Include{file} to $chapters[n]{file}.
#
# Dies if either file cannot be opened, if a chapter or section number is
# repeated, or if a section appears before its chapter.
sub getchaps {
    my ($chap, $sec, $chapno, $chap_as_sec);

    # "or", not "||": with "||" the die attaches to the (always true)
    # filename string and a failed open goes unnoticed.
    open(TOC, "<${dir}manual.toc")
        or die "Can't open ${dir}manual.toc: $!";
    while (<TOC>) {
        if ( /$chapexp/o ) {
            my ($cnum, $cname) = ($1, $2);
            $chap = {name => $cname,
                     number => $cnum};
            if (defined ($chapters[$cnum])) {
                die ("chapter number repeated");
            }
            $chapters[$cnum] = $chap;
            $chap_as_sec = {name => $cname, chapnum => $cnum,
                            secnum => 0, chapter => $chap};
            $chap->{sections}[0] = $chap_as_sec;
            $sections_by_name{canonize($cname)} = $chap_as_sec;
        } elsif ( /$secexp/o ) {
            my ($cnum, $snum, $sname) = ($1, $2, $3);
            if (not defined ($chapters[$cnum])) {
                die ("section in unknown chapter");
            }
            if (defined ( $chapters[$cnum]{sections}[$snum])) {
                die "section number repeated";
            }
            $sec = {name => $sname,
                    secnum => $snum,
                    chapnum => $cnum,
                    chapter => $chapters[$cnum]};
            $sections_by_name{canonize($sname)} = $sec;

            $chapters[$cnum]{sections}[$snum] = $sec;
        } else {
            print STDERR "Bad line: $_";
        }
    }
    close TOC;

    open(TEX, "<${dir}manual.tex")
        or die "Can't open ${dir}manual.tex: $!";
    $chapno = 0;
    while (<TEX>) {
        # \Include lines that are not commented out name the chapter files,
        # in chapter order
        if ( /^[^%]*\\Include\{(.+)\}/ ) {
            my $file = $1;
            if (not -f "$dir$file.tex" or not -r "$dir$file.tex") {
                print STDERR "Chapter file $file.tex does not exist in $dir\n";
            }
            $chapters[++$chapno]{file} = $file;
        }
    }
    close TEX;
}

#
# printchaps -- diagnostic dump of a chapters list (passed as arguments).
# Prints one line per chapter and one indented line per section, and flags
# with "loop problem" any section whose {chapter} back reference is not the
# chapter it is listed under.  Undefined slots are skipped.
#

sub printchaps {
    my @list = @_;
    for my $entry (@list) {
        defined $entry or next;
        print "Chapter ",
              join(" ", $entry->{number}, $entry->{name}, $entry->{file}),
              "\n";
        for my $part (@{$entry->{sections}}) {
            defined $part or next;
            print "    Section ", $part->{chapnum}, ".", $part->{secnum},
                  " ", $part->{name}, "\n";
            print "       loop problem\n" if $part->{chapter} ne $entry;
        }
    }
}

# Trailer appended to every generated page; it also closes <body> and <html>

$footer = join("\n", "<P>", "<address>GAP 3.4.4<br>April 1997</address></body></html>");



# name2fn -- map a chapter/section name to the link target of its page.
#
# The name is looked up (canonized) in %sections_by_name.  Unknown names
# yield "badlink.htm#<name>".  Chapter and section numbers are zero-padded
# to three digits.  With -c the result is CHAPccc.htm for a chapter and
# CHAPccc.htm#SECTsss for a section; otherwise it is CcccSsss.htm.

sub name2fn {
    my ($name) = @_;
    my $target = $sections_by_name{canonize($name)};
    return "badlink.htm#$name" unless defined $target;

    my ($cnum, $snum) = ($target->{chapnum}, $target->{secnum});
    for my $digits ($cnum, $snum) {     # aliases, so both are padded in place
        $digits = "0" . $digits while length($digits) < 3;
    }

    return "C${cnum}S$snum.htm" unless $opt_c;
    return $snum eq "000" ? "CHAP${cnum}.htm"
                          : "CHAP${cnum}.htm#SECT${snum}";
}


# This routine is called to process the text of the section
# the output file is assumed to be pre-selected. The input filehandle
# is simply IN
# 
# As we process, we can be in "normal" status (text), "maths" status 
# inside $ ... $, "verbatim" status inside a multi-line example or
# "shortverb" status inside a short |..|
#
# We separately track whether we are in bold or tt, 
# whether we are in a xxx: .... paragraph and whether we are reading
# a cross-reference that is split across multiple lines
#
# Finally, we track whether we have already
# emitted a <P> for this group of blank lines
#


$LaTeXbinops = join "|", qw(in wedge vee cup cap otimes oplus le ge rightarrow);
$EndLaTeXMacro = '(?![A-Za-z])';

#
# convert_text -- translate the text of one chapter or section to HTML.
#
# Argument: the name of the output file, used to build index targets.
# Input is read from the filehandle IN up to (and consuming) a line that
# starts with 16 or more % signs, or to end of file.  Output is written with
# plain print, i.e. to the currently selected filehandle.
#
# Globals used: $indexcount (read and incremented), %index (index keys are
# appended), $sec (chapter/section numbers for index entries),
# $LaTeXbinops and $EndLaTeXMacro, and name2fn() for cross references.
#
# Dies on a cross reference that contains characters outside $refchars.
#
# This could probably be done more cleverly -- this routine is too long
#

sub convert_text {
    my $fname = $_[0];
    my $refchars = '[\\w\\s-.$]'; # characters that can appear in a cross reference
    my ($status) = "normal";      # normal | maths | verbatim | shortverb
    my ($bold, $tt) = (0,0);      # inside *...* / '...'
    my ($inlist) = "0";           # inside a "tag: definition" paragraph
    my ($inref) = "0";            # inside a cross reference split over lines
    my ($ref) = "";               # text of the reference collected so far
    my ($donepar) = "1";          # <P> already emitted for this blank run
    my $ref1;                     # link target of the current reference

    #
    # Now we loop over lines. a line with 16 initial % signs marks
    # end of section.  defined() keeps a final line "0" without a newline
    # from ending the loop early.
    #

  LINE: while (defined($_ = <IN>) and not /^\%{16,}/) {
        chomp;                  # drop the trailing newline
        my $outline = "";       # build the output in here

        # first we deal with various special whole lines

        # blank lines

        if ($_ eq "") {
            if ($inlist) {
                $outline .= "</DL>";
                $inlist = 0;
            }
            unless ($donepar) {
                $outline .=  "<P>\n";
                $donepar = 1;
            }

            # If we get to the end of a paragraph we assume that we have lost
            # track of what is going on, warn and try to resume.
            # This happens once in the 3.4.3 manual

            if ($status eq "maths" or $status eq "shortverb" or $inref) {
                print STDERR "Paragraph ended in status $status at $.\n" .
                    "reverting to normal\n";
                $outline .= "</I>" if ($status eq "maths");
                $status = "normal";
                # Also abandon a pending cross reference; otherwise the next
                # line would still be parsed as its continuation.
                $inref = 0;
                $ref = "";
            }
            print $outline;
            next LINE;
        }

        # index entries -- emit an anchor and remember the index keys for later
        # there may be several on one line and several references to one key

        if  ( /^\\index/ ) {
            $outline .= "<A name = \"I$indexcount\"></a>\n";
            while (/\\index\{(.*?)\}/g) {
                push @{$index{$1}}, [ "$fname#I$indexcount",
                                      "$sec->{chapnum}.$sec->{secnum}" ];
            }
            $indexcount++;
            # the anchor has to reach the page, or the index links recorded
            # above point nowhere
            print $outline;
            next LINE;
        }

        if (/^\\(newpage|begin\{|end\{)|^\%/) {

            # ignore all of these
            next LINE;

        }
        if (/^\\vspace\{/) {

            # extra para break for \vspace
            $outline .= "<P>";
            print "$outline\n";
            next LINE;
        }

        # Here we have a "non-special" line to process
        # We scan it for special characters and deal with them individually
        # $rest contains the text that we have yet to look at
        # We accumulate the output in $outline, rather than printing it
        # because a : requires us to back up to start of line

        $donepar = 0;
        my $rest = $_;

        # The (rare) situation that we are processing a multi-line
        # cross reference is handled specially

        if ($inref) {

            # if it finishes on this line emit the link
            # otherwise keep accumulating it

            if ($rest =~ /^$refchars+\"/o) {
                $rest = $';
                chop($ref .= $&);       # chop drops the closing quote
                $ref1 = name2fn($ref);
                $outline .= "<a href=\"$ref1\">$ref</a>";
                $inref = "0";
            } elsif ($rest =~ /^$refchars*$/o) {
                $ref .= "$rest ";
                next LINE;
            } else {
                die "Bad reference. So far $ref, now got $rest";
            }
        }

        # The main case, scan for special characters

      SPECIAL: while ( $rest =~ /[\\{}\$|<>\'*:\"&%]/ ) {
            $outline .= $`;         # the part that we scanned past
            $rest = $';             # the remainder
            my $matched = $&;       # the character matched

            # backslash: \GAP etc treated specially, as are some maths forms
            # \= and \> are ignored completely. Otherwise \ is passed
            # through in verbatim and escapes the next character otherwise
            # \\ at the end of a line is a line break.
            #
            if ($matched eq "\\") {
                if ($status =~ /verb/) {
                    $outline .= "\\";
                    next SPECIAL;
                }
                # \GAP etc
                if ($rest =~ /^(GAP|CAS|ATLAS|Z|Q|R)$EndLaTeXMacro/o) {
                    $rest = $';
                    $outline .= "<strong>$&</strong>";
                    next SPECIAL;
                }
                # citations
                if ($rest =~ /^cite\s*\{\s*(\w+)\s*\}/) {
                    $rest = $';
                    $outline .= "<A href=\"biblio.htm#$1\"><cite>$1</cite></a>";
                    next SPECIAL;
                }

                #
                # Try to get nice spacing around certain maths constructs that
                # are used a lot
                #

                if ($status eq "maths") {
                    # binary operators (in, vee ,cup, cap, etc)
                    if ($rest =~/^($LaTeXbinops)$EndLaTeXMacro/o) {
                        $outline .= " $1 ";
                        $rest = $';
                        next SPECIAL;
                    }
                    # \backslash
                    if ($rest =~/^backslash$EndLaTeXMacro/o) {
                        $outline .= " \\ ";
                        $rest = $';
                        next SPECIAL;
                    }
                    # \split
                    if ($rest =~/^split$EndLaTeXMacro/o) {
                        $outline .= ":";
                        $rest = $';
                        next SPECIAL;
                    }
                    # angle brackets
                    if ($rest =~/^langle$EndLaTeXMacro/o) {
                        $outline .= " <";
                        $rest = $';
                        next SPECIAL;
                    }
                    if ($rest =~ /^rangle$EndLaTeXMacro/o) {
                        $outline .= "> ";
                        $rest = $';
                        next SPECIAL;
                    }
                } elsif ($rest =~ /^[\\]/) {
                    # \\ outside maths forces a new line
                    $rest = $';
                    $outline .= "<BR>";
                    next SPECIAL;
                }
                # ignore  \= and \>  anywhere
                if ($rest =~ /^[=>]/) {
                    $rest = $';
                    next SPECIAL;
                }
                # ignore \ at end of line
                if ($rest eq "")  {
                    print "$outline\n";
                    next LINE;
                }
                #
                # default backslash handling, pass next character through directly
                #
                $outline .= substr($rest,0,1);
                $rest = substr($rest,1);
                next SPECIAL;
            }

            # Braces are passed through in maths or verbatim,
            # ignored otherwise
            if ($matched =~ /[{}]/) {
                if ($status =~ /verb/ or
                    $status eq "maths") {
                    $outline .= $matched;
                }
                next SPECIAL;
            }

            # $ prints in verbatim and toggles maths mode otherwise
            if ($matched eq "\$") {
                if ($status=~ /verb/) {
                    $outline .= $matched;
                    next SPECIAL;
                }
                if ($status eq "maths") {
                    $status = "normal";
                    $outline .= "</I>";
                    next SPECIAL;
                }
                $status = "maths";
                $outline .= "<I>";
                next SPECIAL;
            }

            # | toggles one or other verbatim mode
            # to decide which, we look ahead to try and spot a matching |
            # on the same line.

            if ($matched eq "|") {
                if ($status eq "verbatim") {
                    $status = "normal";
                    $outline .= "</pre>";
                    next SPECIAL;
                }
                if ($status eq "shortverb") {
                    $status = "normal";
                    $outline .= "</code>";
                    next SPECIAL;
                }
                if ($status eq "maths") {
                    $outline .= "|";
                    next SPECIAL;
                }
                if ($rest =~ /\|/) {
                    $status = "shortverb";
                    $outline .= "<code>";
                    next SPECIAL;
                }
                $status = "verbatim";
                $outline .= "<pre>";
                next SPECIAL;
            }

            # < > open and close italic variable names, in normal mode
            # otherwise we have to translate them for HTML

            if ($matched eq "<") {
                if ($status eq "normal") {
                    $outline .= "<var>";
                } else {
                    $outline .= "&lt;";
                }
                next SPECIAL;
            }
            if  ($matched eq ">") {
                if ($status eq "normal") {
                    $outline .= "</var>";
                } else {
                    $outline .= "&gt;";
                }
                next SPECIAL;
            }

            # * in normal mode toggles bold-face

            if ($matched eq "*") {
                if ($status eq "normal") {
                    if ($bold) {
                        $outline .= "</strong>";
                        $bold = 0;
                    } else {
                        $outline .= "<strong>";
                        $bold = 1;
                    }
                } else {
                    $outline .= "*";
                }
                next SPECIAL;
            }
            # ' in normal mode toggles typewriter

            if ($matched eq "\'") {
                if ($status eq "normal") {
                    if ($tt) {
                        $outline .= "</code>";
                        $tt = 0;
                    } else {
                        $outline .= "<code>";
                        $tt = 1;
                    }
                } else {
                    $outline .= "\'";
                }
                next SPECIAL;
            }
            # : signals a definition. We go back to start of line
            # for the tag, and on to end of para for the definition
            #
            # We do not merge adjacent definitions into the same list

            if ($matched eq ":") {
                if ($status ne "normal" or $inlist) {
                    $outline .= $matched;
                    next SPECIAL;
                }
                $outline = "<DL><DT>" . $outline . ":<DD>";
                $inlist = 1;
                next SPECIAL;
            }
            # " starts a cross-reference. If it ends on the same input line
            # then we can deal with it at once otherwise we set $inref

            if ($matched eq "\"") {
                if ($status =~ /verb/) {
                    $outline .= "\"";
                    next SPECIAL;
                }
                if ($rest =~ /^$refchars+\"/o) {
                    $rest = $';
                    chop($ref = $&);    # chop drops the closing quote
                    $ref1 = name2fn($ref);
                    $outline .= "<a href=\"$ref1\">$ref</a>";
                    next SPECIAL;
                }
                if ($rest =~ /^$refchars*$/o) {
                    $ref = "$rest ";
                    $inref = 1;
                    next LINE;
                }
                die "Bad reference $rest at $_";
            }

            # & translates to # in verbatim mode

            if ($matched eq "&") {
                if ($status eq "verbatim") {
                    $outline .= "#";
                } else {
                    $outline .= "&";
                }
                next SPECIAL;

            }

            # ignore from % to end of line, in non-verbatim modes
            # the on-line browser does not do this
            if ($matched eq "%") {
                if ($status =~ /verb/) {
                    $outline .= $matched;
                    next SPECIAL;
                }
                print $outline."\n";
                next LINE;
            }

        }                       # SPECIAL
        print $outline.$rest."\n";
    }      # LINE
}



# startfile -- open and select the output page for a chapter or section
# record and print its HTML header.
#
# Argument: a section record (secnum 0 denotes the chapter itself).
# Returns ($fname, $re): the page's file name and a regular expression
# matching the \Chapter{...} or \Section{...} line that starts its text.
# Dies if the computed name contains "#" or the file cannot be created.
# Leaves OUT open and selected as the default output handle.
sub startfile {
    my $sec = $_[0];
    my ($num, $name, $re, $fname, $name1);
    if ($sec->{secnum} == 0) {
        # Use the record we were given; the chapter's pseudo-section carries
        # the chapter's own number and name, so no global is needed.
        $num = $sec->{chapnum};
        $name = $sec->{name};
        $name1 = quotemeta $name;
        $re = "^\\\\Chapter\\{$name1\\}";
    } else {
        $num = $sec->{chapnum} . "." .$sec->{secnum};
        $name = $sec->{name};
        $name1 = quotemeta $name;
        $re = "^\\\\Section\\{$name1\\}";
    }
    $fname = name2fn($sec->{name});
    if ($fname =~ /\#/) { die "Filename contains #" };
    open(OUT, ">${odir}${fname}")
        or die "Can't create ${odir}${fname}: $!";
    select OUT;
    print  "<html><head><title>GAP Manual: $num $name</title></head>\n";
    print  "<body bgcolor=\"ffffff\">\n<h1>$num $name</h1>\n<p>";
    return ($fname, $re);
}

# startsubsec -- file-per-chapter mode: print the anchor and level 2 heading
# that open a section inside the chapter page, and return the regular
# expression matching the section's \Section{...} line.
sub startsubsec {
    my ($section) = @_;
    my ($chapnum, $secnum, $title) = @{$section}{qw(chapnum secnum name)};

    # anchor names carry the section number zero-padded to three digits
    my $anchor = $secnum;
    $anchor = "0" . $anchor while length($anchor) < 3;

    print "<A NAME=\"SECT$anchor\"><h2>", $chapnum . "." . $secnum,
          " $title</h2></a>\n<P>";
    return "^\\\\Section\\{" . quotemeta($title) . "\\}";
}

# sectionlist -- print an ordered list of links to the sections of a
# chapter.  Entry 0 of the chapter's section list (the chapter itself) is
# left out.
sub sectionlist {
    my ($owner) = @_;
    print  "<P>\n<H3> Subsections</H3>\n<oL>\n";
    for my $entry (@{$owner->{sections}}) {
        unless ($entry->{secnum} == 0) {
            my $href = name2fn($entry->{name});
            print  "<LI> <A HREF=\"$href\">$entry->{name}</a>\n";
        }
    }
    print  "</ol>\n";
}

#
# Basically the chapter file is read in one pass, using information previously 
# read from the .toc file to fill in next and previous pointers and the like
#

# navigation -- print the Previous / Up / Top / Next / Index links for the
# page of a section record.
#
# A chapter page (secnum 0) links to its neighbouring chapters in @chapters
# and up to chapters.htm.  A section page links to its neighbouring sections,
# up to its chapter page and to index.htm as "Top".  "Previous" is omitted
# for number 1 and "Next" for the last index of the list.
sub navigation {
    my ($page) = @_;
    my $owner = $page->{chapter};
    my $up_link = name2fn($owner->{name});
    my $is_chapter = ($page->{secnum} == 0);

    # $peers is the list this page lives in, $pos its index in that list
    my ($peers, $pos);
    if ($is_chapter) {
        ($peers, $pos) = (\@chapters, $owner->{number});
    } else {
        ($peers, $pos) = ($owner->{sections}, $page->{secnum});
    }

    if ($pos != 1) {
        my $target = name2fn($peers->[$pos - 1]{name});
        print  "<a href =\"$target\">Previous</a> ";
    }

    if ($is_chapter) {
        print  "<a href = \"chapters.htm\">Up</a> ";
    } else {
        print  "<a href = \"$up_link\">Up</a> ";
        print  "<a href = \"index.htm\">Top</a> ";
    }

    if ($pos != $#{$peers}) {
        my $target = name2fn($peers->[$pos + 1]{name});
        print  "<a href =\"$target\">Next</a>";
    }

    print  "<BR><a href = \"theindex.htm\">Index</a>\n";
}
    

# endpage -- finish the current page: print $footer to the selected handle,
# close OUT and make STDOUT the default output handle again.
# A failed close (buffered data could not be written) is reported on STDERR
# rather than passing silently.
sub endpage {
    print  $footer;
    close(OUT) or print STDERR "Warning: error closing output page: $!\n";
    select STDOUT;
}


# convert_chap -- convert one chapter's .tex file to HTML.
#
# Argument: a chapter record with {file} and {sections} filled in.
# Reads ${dir}<file>.tex through the filehandle IN in a single pass, walking
# the chapter's section list in order.  Without -c every section gets its
# own page (startfile ... endpage); with -c one page is opened for the whole
# chapter and each section becomes a sub-heading inside it.
#
# Resets $indexcount, and iterates with the package variable $sec, which
# convert_text reads when it records index entries.
#
# Dies if the chapter has no file, the file cannot be opened, or an expected
# \Chapter / \Section line is missing.
sub convert_chap {
    my ($chap) = @_;
    my $re;
    my $fname;
    $indexcount = 0;

    defined $chap->{file}
        or die "No .tex file known for chapter $chap->{number}";
    my $texfile = $dir . $chap->{file} . ".tex";
    open(IN, "<$texfile") or die "Can't open $texfile: $!";
    $_ = <IN>;
    $_ = "" unless defined $_;  # empty file: let the search below report it

    # loop, controlled by the list of sections that we expect
    # will fail, possibly messily if this does not match reality

    if ($opt_c) {               # each chapter in a single file
        ($fname,$re) = startfile($chap->{sections}[0]);
    }

  SECT: for $sec (@{$chap->{sections}}) {

        # a gap in the section numbering leaves an empty slot; skip it
        next SECT unless defined $sec;

        # sort out what we are processing (chapter or section)
        # produce the header of the Web page

        if ($opt_c) {
            $re = startsubsec($sec) unless ($sec->{secnum} == 0);
        } else {
            ($fname, $re) = startfile($sec);
        }

        #
        # Look for the \Chapter or \Section line
        #

        while ( $_ !~ /$re/) {
            defined($_ = <IN>)
                or die "Missing chapter or section line matching $re";
        }

        convert_text($fname);

        # Here we have processed the whole section and start to attach footers
        # to it. If it is really a chapter then it gets a list of its sections

        if ($sec->{secnum} == 0) {
            sectionlist($chap);
        }
        unless ($opt_c) {
            navigation($sec);
            endpage();
        }
    }
    if ($opt_c) {
        navigation($chap->{sections}[0]);
        endpage();
    }
    close IN;
}



# chapters_page -- write ${odir}chapters.htm: an ordered list of links to
# every defined chapter, followed by links to the bibliography, the index
# and the front page, and the standard footer.
# Dies if the file cannot be created; a failed close is reported on STDERR.
sub chapters_page {
    my $page = "${odir}chapters.htm";
    open(OUT, ">$page") or die "Can't create $page: $!";
    select OUT;

    print  <<END
<html><head><title>The GAP Manual -- Chapters</title></head>
<body bgcolor=\"ffffff\"><h1>The GAP Manual -- Chapters</h1><ol>
END
    ;

  CHAP: for my $chap (@chapters) {
        next CHAP unless defined $chap;     # slot 0 and gaps are empty
        my $link = name2fn($chap->{name});
        print  "<LI> <A HREF=\"$link\">$chap->{name}</a>\n";
    }

    print  <<END
</ol>\n<p>\n<a href=\"biblio.htm\">References</a><p>
<a href=\"theindex.htm\">Index</a><p>
<a href=\"index.htm\">Up</a><P>
END
    ;

    print  $footer;
    close(OUT) or print STDERR "Warning: error closing $page: $!\n";
    select STDOUT;
}
    
# Sort comparator (uses the package variables $a and $b): order without
# regard to case first, then break ties with a plain string comparison.
sub caseless {
    my $order = lc($a) cmp lc($b);
    return $order ? $order : ($a cmp $b);
}

# index_page -- write ${odir}theindex.htm from %index.
#
# The page starts with a row of links A..Z to anchors LA..LZ, followed by
# one list item per index key (sorted with caseless) carrying a link for
# every recorded reference.  The first "!" of a key is shown as a space.
#
# Anchor L<x> is placed before the first entry whose initial is >= x; any
# letters left over after the last entry get their anchors at the end of
# the list, so every link in the A..Z row resolves.
# Dies if the file cannot be created; a failed close is reported on STDERR.
sub index_page {
    my ($ent, $ref, $letter, $nextletter);
    my $page = "${odir}theindex.htm";
    open(OUT, ">$page") or die "Can't create $page: $!";
    select OUT;
    print <<END
<html><head><title>The GAP Manual -- Index</title></head>
<body bgcolor=\"ffffff\"><h1>The GAP Manual -- Index</h1>
<p>
END
    ;
    foreach $letter  ("A".."Z") {
        print  "<a href=\"theindex.htm#L$letter\">$letter</A> ";
    }
    print  "\n<ul>";

    $nextletter = "A";

  ENTRY: for $ent (sort caseless keys %index) {
        $letter = uc(substr($ent,0,1));

        # "Z"++ is "AA", which still compares le "Z" as a string; the length
        # test stops after Z, where a plain string test would emit bogus
        # anchors or never terminate for initials sorting after "Z".
        while (length($nextletter) == 1 and $nextletter le $letter) {
            print  "<A name = \"L$nextletter\"></a>";
            $nextletter++;
        }

        my $ent1 = $ent;
        $ent1 =~ s/!/ /;
        print  "<LI>$ent1 ";
        for $ref (@{$index{$ent}}) {
            print  "<A HREF=\"$ref->[0]\">$ref->[1]</A> ";
        }
        print  "\n";
    }

    # anchors for the letters that follow the last entry
    while (length($nextletter) == 1) {
        print  "<A name = \"L$nextletter\"></a>";
        $nextletter++;
    }

    print  "</ul>\n<p>\n";
    print  "<a href=\"index.htm\">Up</a><P>\n";
    print  $footer;
    close(OUT) or print STDERR "Warning: error closing $page: $!\n";
    select STDOUT;
}

#
# Main program starts here
#
# Process option and sort out input and output directories
#

getopts('cs');

# _absolute_dir -- turn a directory argument into an absolute path with a
# trailing "/".  A relative (or missing/empty) argument is taken relative to
# the directory reported by pwd.
sub _absolute_dir {
    my ($path) = @_;
    $path = "" unless defined $path;
    chomp $path;
    if (substr($path,0,1) ne "/") {
        my $cwd = `pwd`;
        chomp $cwd;
        $path = $cwd . "/" . $path;
    }
    if (substr($path,-1) ne "/") {
        $path .= "/";
    }
    return $path;
}

$dir = _absolute_dir(shift @ARGV);
unless (-d $dir and -r $dir) {
    die "Can't use input directory $dir";
}
print  "Reading input from $dir\n" unless ($opt_s);

getchaps();
print  "Processed TOC file\n" unless ($opt_s);

# html-directory is optional and defaults to the current directory
$odir = _absolute_dir(@ARGV ? shift @ARGV : "");
unless (-d $odir and -w $odir) {
    die "Can't use output directory $odir";
}
print  "Creating output in $odir\n" unless ($opt_s);


#
# OK go to work
#

# $chap is kept as a package variable here (not "my"): routines called from
# convert_chap may refer to the chapter currently being processed through it.
CHAP: foreach $chap (@chapters) {
    unless (defined $chap) {
        next CHAP;
    }
    print  "$chap->{name}\n" unless ($opt_s);
    convert_chap($chap);
}

print  "and the chapters page\n" unless ($opt_s);
chapters_page();
print  "and the index page\n" unless ($opt_s);
index_page();
print  "now calling the other script to do the bibliography\n" unless ($opt_s);

# List form: the paths are passed as arguments without going through the
# shell, so spaces or metacharacters in directory names are harmless.
my @fixbib = ("./fixbib.pl", "${dir}manual.bib", "${odir}biblio.htm");
system(@fixbib) == 0
    or print STDERR "Warning: @fixbib failed (status $?)\n";
print  "done\n" unless ($opt_s);





	    
    

    
	
    

