#!/usr/bin/perl -w

# $Id: lout2html,v 1.14 1999/11/15 20:16:13 root Exp root $

# Copyright (c) 1999 Mark Summerfield. All Rights Reserved.
# May be used/distributed under the GPL.

use strict ; 

use integer ;

use Getopt::Long ;
use Lout ;
use HTML::Entities ;

use vars qw( $VERSION ) ;
$VERSION    = '1.06-alpha' ;

my $Wrapper = 1 ;
my $Verbose = 1 ;


# setup data

my $TEXT    = '__TEXT__' ;
my $ID      = '__ID__' ;
my $CHAR    = '__CHAR__' ;

my $ESC     = "\x00" ;
my $AT      = "\x01" ;
my $BSLASH  = "\x02" ;
my $FSLASH  = "\x03" ;
my $DQUOTE  = "\x04" ;
my $BAR     = "\x05" ;
my $HASH    = "\x06" ;
my $AND     = "\x07" ;
my $X       = "\xF8" ;
my $Y       = "\xF9" ;
my $LBRACE  = "\xFC" ;
my $RBRACE  = "\xFD" ;
my $HAT     = "\xFE" ;
my $TILDE   = "\xFF" ;

my %Entity2char = %Lout::Entity2char ;
delete @Entity2char{'amp', 'gt', 'lt', 'quot'} ;
@Entity2char{'@Bullet', '@CDot', '@Char bullet'} = split ' ', 'middot ' x 3 ; 
$Entity2char{'@TradeMark'}   = 'trade' ; 

my %Char2entity = map { 
        $Entity2char{$_} =~ s/[{}]//go ;
        $Entity2char{$_} =~ /^\@Char/o ? ( $Entity2char{$_}, "&" . "$_;" ) : () ; 
    } keys %Entity2char ;

%Char2entity = ( 
    %Char2entity,
    '@Char oe'              => 'oe',
    '@Char OE'              => 'OE',
    '@Char fi'              => 'fi',
    '@Char fl'              => 'fl',
    '@Char florin'          => 'f',
    '@Char dagger'          => '+',
    '@Char quotesinglbase'  => ',',
    '@Char quotedblbase'    => '"',
    '@Char quotedblleft'    => '"',
    '@Char quotedblright'   => '"',
    '@Char quoteright'      => "'",
    '@Char endash'          => '-',
    '@Char emdash'          => '--',
    '@Char daggerdbl'       => '++',
    '@Char ellipsis'        => '...',
    '@Char fraction'        => '/',
    '@Char backslash'       => '\\',
    ) ;

my %Id2entity = map { 
        $Entity2char{$_} !~ /^\@Char/o ? ( $Entity2char{$_}, "&" . "$_;" ) : () ; 
    } keys %Entity2char ;

undef %Entity2char ;

my $Colour  = join "|", qw( 
                darkred     red     lightred
                darkgreen   green   lightgreen
                darkblue    blue    lightblue
                darkcyan    cyan    lightcyan
                darkmagenta magenta lightmagenta
                darkyellow  yellow  lightyellow
                darkgrey    grey    lightgrey
                darkgray    gray    lightgray
                black white
                ) ;

my $Font    = join "|", qw(
                AvantGarde Bookman Chancery Courier Helvetica Schoolbook
                Palatino Symbol Times Dingbats
                ) ;

my $Family  = join "|", qw( 
                Base Black BlackOblique Bold BoldItalic BoldItalicOsF
                BoldObl BoldOblique BoldOsF BoldSC BoldSlope Book
                BookOblique Compressed Cond CondBlack CondBlackObl
                CondBold CondBoldObl CondBook CondDemi CondLight
                CondLightObl CondMedium CondOblique Demi DemiItalic
                DemiOblique ExtraBold ExtraCompressed ExtraLight
                ExtraLightObl Italic ItalicOsF Light LightItalic
                Medium MediumItalic MediumObl Narrow NarrowBold
                NarrowBoldObl NarrowObl Oblique Roman RomanSC SC
                Semibold SemiboldItalic Slope UltraCompressed
                ) ;

my $Arg     = join "|", qw(
                type style labelwidth indent rightindent gap start
                margin linewidth paint
                ) ;

my $AtArg   = join "|", qw(
                Location Label RunningTitle
                InitialFont InitialBreak InitialSpace InitialLanguage
                PageOrientation PageHeaders FirstPageNumber ColumnNumber
                OptimizePages Unpaginated
                ContentsGoesHere
                CoverSheet ContentsSeparate AbstractDisplay
                ) ;

my %Id ;


# These must be in the symbol table.
use vars qw( $Time $Date $Sec $Min $Hour $Mday $Month $Year ) ;


# main

&prepare ;

local $_ ; 

{
    while( <DATA> ) {
        last if /^$/ ;
        chomp ;
        my( $k, $v ) = split ;
        $v = '' unless defined $v ;
        $Id{$k} = $v ;
    }    
    
    ( $Sec, $Min, $Hour, $Mday, $Month, $Year ) = (localtime( time ))[0..5] ;
    $Year += 1900 ; $Month++ ; 
    $Month = "0$Month" if $Month < 10 ; 
    $Mday  = "0$Mday"  if $Mday  < 10 ;
    $Date  = "$Year/$Month/$Mday" ;
    $Time  = "$Hour:$Min:$Sec" ;

    $/ = undef ; # Slurp mode.
    $_ = <> ;
}

&head if $Wrapper ;
&parse ;
&tail if $Wrapper ;

print STDERR "\n" if $Verbose ;


# subroutines.

sub head {

    print <<__EOT__ ;
<HTML>
<HEAD>
<TITLE>$ARGV</TITLE>
<!-- Generated by lout2html on $Date at $Time -->
</HEAD>
<BODY>
__EOT__
}


sub tail {
    
    print <<__EOT__ ;
</BODY>
</HTML>
__EOT__
}


sub parse {
    
    my @list    = () ;
    my @pending = () ;
    my $brace   = 0 ;
    my $heading = '1' ;
    my $i       = 0 ;

    print STDERR "Processing chunk: " if $Verbose ;

    $_ = &esc_lout( $_ ) ;

    while( my( $token, $value ) = &getok ) {
        printf STDERR "%08d\b\b\b\b\b\b\b\b", $i if $Verbose and $i % 16 == 0 ;
        $i++ ;

        CASE : {
            if( $token eq $CHAR ) {
                $value = $Char2entity{"\@Char $value"} || "\n\@Char $value\n" ;
                print $value ;
                last CASE ;
            }
            if( $token eq $ID ) {
                ID : {
                    if( exists $Id2entity{$value} ) {
                        print $Id2entity{$value} ;
                        last ID ;
                    }
                    if( exists $Id{$value} ) {
                        # No action taken unless simple or variable replacement.
                        if( $Id{$value} ) {
                            my $value = $Id{$value} ;
                            no strict 'refs' ;
                            $value = ${$1} if $value =~ /^\$(\w+)/o ;
                            print $value ;
                        }
                        last ID ;
                    }
                    if( $value eq '@FullWidthRule' or
                        $value eq '@NP'
                        ) {
                        print "\n<HR>\n" ;
                        last ID ;
                    }
                    if( $value =~ /^\@[LDP]P$/o ) {
                        print "\n<P>\n" ;
                        last ID ;
                    }
                    if( $value eq '@LLP' ) {
                        print "\n<BR>\n" ;
                        last ID ;
                    }
                    if( $value eq '@Chapter'  or
                        $value eq '@Report'   or
                        $value eq '@Appendix'
                        ) {
                        $heading = '1' ;
                        last ID ;
                    }
                    if( $value eq '@Section' or
                        $value eq '@SubAppendix'
                        ) {
                        $heading = '2' ;
                        last ID ;
                    }
                    if( $value eq '@SubSection'     or
                        $value eq '@SubSubAppendix'
                        ) {
                        $heading = '3' ;
                        last ID ;
                    }
                    if( $value eq '@SubSubSection' or
                        $value eq '@Theorem'
                        ) {
                        $heading = '4' ;
                        last ID ;
                    }
                    if( $value eq '@Title' ) {
                        print "\n<H$heading>" ;
                        push @pending, "</H$heading>\n" ;
                        $heading = '1' ;
                        last ID ;
                    }
                    if( $value eq '@Heading' or
                        $value eq '@Caption'
                        ) {
                        print "\n<H3>" ;
                        push @pending, "</H3>\n" ;
                        last ID ;
                    }
                    if( $value eq '@Verbatim' ) {
                        print "\n<PRE>" ;
                        push @pending, "\n</PRE>\n" ;
                        last ID ;
                    }
                    if( $value =~ /\@(?:Foot|Left|Right)Note/o ) {
                        print "\n[<I>Note:</I> " ;
                        push @pending, "]\n" ;
                        last ID ;
                    }
                    if( $value eq '@Tag' ) {
                        print "\n[<I>Tag:</I> " ;
                        push @pending, "]\n" ;
                        last ID ;
                    }
                    if( $value eq '@CD' or 
                        $value eq '@CentredDisplay' ) {
                        print "\n<CENTER>\n" ;
                        push @pending, "\n</CENTER>\n" ;
                        last ID ;
                    }
                    if( $value =~ /^\@Su([bp])$/o ) {
                        print "\n<SU\U$1>" ;
                        push @pending, "</SU\U$1>\n" ;
                        last ID ;
                    }
                    if( $value eq '@SysInclude' ) {
                        print "<!-- IGNORED \@SysInclude { $1 } -->\n" 
                        if s/\s*?{\s*?([\S]+?)\s*?}//os ;
                        last ID ;
                    }
                    if( $value eq '@Include' ) {
                        print qq{<A HREF="$1">$1</A>\n}
                        if s/\s*?{\s*?([\S]+?)\s*?}//os ;
                        last ID ;
                    }
                    if( $value =~ /\@(BI|[BFI]|Underline|Code)$/o ) {
                        my $type = $1 ;
                        my $nl = '' ;
                        $type = 'U'    if $type eq 'Underline' ;
                        if( $type eq 'F' or $type eq 'Code' ) {
                            $type = 'TT' ;
                            # Crude but seems to work-ish.
                            $type = 'PRE', $nl = "\n" if index( $_, '}' ) > 32 ; 
                        }
                        $type = 'B><I' if $type eq 'BI' ;
                        print "$nl<$type>" ;
                        print "\n" if $type eq 'PRE' ;
                        $type = 'I></B' if $type eq 'B><I' ;
                        push @pending, "$nl</$type>" ;
                        ( $token, $value ) = &getok ;
                        if( $token eq $TEXT ) {
                            $value = &unesc_lout( $value ) ;
                            # Aargh! I'm using Perl 5.004 which doesn't have
                            # negative look behind assertions! Should be:
                            # if( $value =~ /(?<!=<A)\s(?!>)/osx ) {
                            #     $value =~ s/(?<!=<A)(\s)(?!>)/<\/$type>$1/osx ;
                            if( $value =~ /[^<][^A]\s[^>]/os ) { #/
                                $value =~ s/([^<][^A])\s([^>])/$1$nl<\/$type> $2/os ;
                            }
                            else {
                                $value .= "$nl</$type>" ;
                            }
                            pop @pending ;
                            print $value ;
                        }
                        else {
                            redo CASE ;
                        }
                        last ID ;
                    }
                    if( $value =~ /^\@(?:Drop)?ListItem/o or 
                        $value =~ /^\@(?:Drop)?TagItem/o  or
                        $value eq '@LI' #/
                        ) {
                        print "\n<LI>" ;
                        last ID ;
                    }
                    if( $value =~ /
                        \@(?:Raw)?(?:Paren)?(?:UC)?
                        (?:Numbered|Roman|Alpha|Bullet|Star|Dash)?
                        List
                        /ox ) {
                        my $list = 'OL' ;
                        $list = 'UL' if $value =~ /(?:Bullet|Star|Dash)/o ;
                        my $type = '1' ;
                        if( $value =~ /Roman/o ) {
                            $type = 'i' ;
                        }
                        elsif( $value =~ /Alpha/o ) {
                            $type = 'a' ;
                        }
                        $type = uc $type if $value =~ /UC/o ;
                        $type = $list eq 'UL' ? '' : qq{ TYPE="$type"} ;
                        print qq{\n<$list$type>\n} ;
                        push @list, "\n</$list>\n" ;
                        last ID ;
                    }
                    if( $value =~ /TaggedList/o ) {
                        print "\n<UL>\n" ;
                        push @list, "\n</UL>\n" ;
                        last ID ;
                    }
                    if( $value =~ /^\@(?:Raw)?EndList/o ) {
                        print pop @list if scalar @list ;
                        last ID ;
                    }
                    DEFAULT : {
                        print "\n$value\n";
                        last ID ;
                    }
                }
                last CASE ;
            }
            if( $token eq $LBRACE ) {
                $brace++ ;
                last CASE ;
            }
            if( $token eq $RBRACE ) {
                # We need these strange comments in case we pop up in the
                # middle of a comment.
                print "<!-- -->", pop @pending, "<!-- -->" 
                while scalar @pending and $brace-- ; 
                last CASE ;
            }
            if( $token eq $TEXT ) {
                print &unesc_lout( $value ) ;
                last CASE ;
            }
            DEFAULT : {
                # Should never happen...
                warn "Unrecognised parse token $token\n" ;
                last CASE ;
            }
        }
        last unless length ;
    }
    print @list    if scalar @list ;
    print @pending if scalar @pending ;
}


sub getok {

    my( $token, $value ) = ( undef, '' ) ;

    if( s/^\@Char\s+([A-Za-z]+)\s*//os ) {
        $value = $1 ;
        $token = $CHAR ;    
    }
    elsif( s/^(\@[\@A-Za-z_]+)\s*//os ) {
        $value = $1 ;
        $token = $ID ;    
    }
    elsif( s/^${ESC}(cragged|clines)//os ) {
        $value = '@CD' ;
        $token = $ID ;    
    }
    elsif( s/^\s*{\s*//os ) {
        $token = $LBRACE ;
    }
    elsif( s/^}\s*//os ) {
        $token = $RBRACE ;
    }
    else {
        s/^([^\@{}]+)//os ;
        $value = $1 ;
        $token = $TEXT ;
    }

    ( $token, $value ) ;
}


sub esc_lout {

    local $_ = shift ;

    # Delete stuff we can't or don't want to handle.
    s/\b(nohyphen)\b//gos ;

    # Fix some raw lout.
    # Again we really need negative look ahead and negative look behind...
    s,//(\d+(?:\.\d+)?(?:[pf])?)(?!\"),{\@LP}$1,gos ; #"
    s/\|(\d+(?:\.\d+)?(?:[pf])?)(?!\")/{\@LP}$1/gos ; 

    # Comment out or delete args.
    s/^#(.*)$/$ESC$X$1$ESC$Y/gom ;
    s/([^"])#(.*)$/$1$ESC$X$2$ESC$Y/gom ;
    s{\{\s*(adjust|outdent|[ro]?ragged)\s*
      (?:\d+(?:\.\d+)?[pf])?\s*(?:(?:no)?hyphen)?\s*\}}
     {$ESC$X$1$ESC$Y}gosx ;
    s/\b((?:$Arg)\s*\{[^}]+\})/$ESC$X$1$ESC$Y/gos ;
    s/\@((?:$AtArg)\s*\{[^}]+\})/$ESC$X$1$ESC$Y/gos ;
    s/\@(Place\s*x\s*{[^}]+\}\s*y\s*{[^}]+\})/$ESC$X$1$ESC$Y/gos ;
    s/((?:$Colour)\s+)\@(Colou?r)/$ESC$X$1$2$ESC$Y/gos ;
    s/(\{\s*(?:$Font)?\s*(?:$Family)?\s*(?:[+]?\d+(?:\.\d+)?[pf])?\s*\})/
      $ESC$X$1$ESC$Y/gosx ;
    
    # Escape lout escapes.
    s/"(\S*)\@(\S*)"/$1$ESC$AT$2/gos ;
    s/"\\\\"/$ESC$BSLASH/gos ;
    s!"/"!$ESC$FSLASH!gos ;
    s/"\\""/$ESC$DQUOTE/gos ;
    s/"\|"/$ESC$BAR/gos ;
    s/"#"/$ESC$HASH/gos ;
    s/"&"/$ESC$AND/gos ;
    s/"{"/$ESC$LBRACE/gos ;
    s/"}"/$ESC$RBRACE/gos ;
    s/"^"/$ESC$HAT/gos ;
    s/"~"/$ESC$TILDE/gos ;

    # Make some items psuedo identifiers to be picked up in getok.
    s/\b(cragged|clines)\b/$ESC$1/gos ;
   
    $_ ;
}


sub unesc_lout {
    local $_ = shift ;

    # Unescape the escaped lout.
    s/$ESC$AT/\@/gos ;
    s!$ESC$BSLASH!\\!gos ;
    s!$ESC$FSLASH!/!gos ;
    s/$ESC$DQUOTE/"/gos ;
    s/$ESC$BAR/|/gos ;
    s/$ESC$HASH/#/gos ;
    s/$ESC$AND/&/gos ;
    s/$ESC$LBRACE/{/gos ;
    s/$ESC$RBRACE/}/gos ;
    s/$ESC$HAT/^/gos ;
    s/$ESC$TILDE/~/gos ;

    # Convert to HTML entities where necessary.
    $_ = encode_entities( $_ ) ;
    s/~/\&nbsp;/gos ;

    # Comment commented out bits.
    s/&#0;&oslash;/<!-- /gos ;
    s/&#0;&ugrave;/ --> /gos ;
    
    # Guess email addresses and web links.
    s{((?:(?:http|ftp):\/\/|www\.)[-A-Za-z0-9._\/]*[A-Za-z0-9\/](?:\.html?)?)}
     {<A HREF="$1">$1<\/A>}gosi ;
    s/([-A-Za-z0-9._\/]+\@[-A-Za-z0-9._\/]+)/<A HREF="mailto:$1">$1<\/A>/gosi ;

    # Guess images.
    s/([-A-Za-z0-9._\/]+\.eps(?:\.(?:gz|z|Z))?)/[Image "$1"]/gosi ;

    $_ ;
}


sub prepare {

    # Change these in &help if changed here.
    my $outfile = 'STDOUT' ;

    Getopt::Long::config 'no_ignore_case' ;
    GetOptions(
        'h|help'        => \&help,
        'o|outfile=s'   => \$outfile,
        'v|verbose=i'   => \$Verbose,
        'w|wrapper=i'   => \$Wrapper,
        ) or die "\n" ;

    if( $outfile ne 'STDOUT' ) {
        open OUT, ">$outfile" or die "Failed to open $outfile: $!\n" ;
        select( OUT ) ;
    }
}


sub help {
    print <<__EOT__ ;
lout2html v $VERSION. Copyright (c) Mark Summerfield 1999. All rights reserved.
This is free software you can use/modify it under the same terms as perl.

usage: lout2html [options] <files>

options:
-h    --help            Print this help screen and exit
-o F  --outfile F       Write to file F [STDOUT]
-v    --verbose         Verbose (so you can see it work albeit slowly!) [$Verbose]
-w    --wrapper         Wrap text in <HTML><HEAD></HEAD><BODY>... etc [$Wrapper]

This only works on a subset of lout. Tables are not supported and may never
be. Diagrams and graphics will probably never be supported. Macros and
definitions are not handled, mistakes are made - however if you've got lots of
text and lists this may give you enough of what you need.
__EOT__
    exit ;
}
 

# NB The 'variables' that appear below aren't variables at all - they are just
# strings that look like variables. However, later we will use 'no strict
# refs' to let us get the values of the variables that bear their names.
__END__
@BackEnd
@Begin
@BeginSections
@BeginSubSections
@Box
@Break
@Case
@CNP
@CurveBox
@Dagger     +
@DaggerDbl  ++
@Date
@DateLine   $Date
@Day        $Mday
@DayNum     $Mday
@Display
@Doc
@Document
@DropCapTwo
@DropCapThree
@End
@Florin     f
@Font
@Format
@FullWidth
@High
@Hour       $Hour
@ID
@IncludeGraphic
@IndentedDisplay
@Index
@IndexA
@IndexB
@Language
@LD
@LeftDisplay
@Lozenge    &lt;&gt; 
@MeriDiem
@Minute     $Min
@MonthNum   $Month
@Month      $Month
@Next
@NumberOf
@OneCol
@OneRow
@S
@Scale
@Second     $Sec 
@ShadowBox
@ShortDay   $Mday
@ShortHour  $Hour
@ShortMeriDiem
@ShortMonth $Month
@ShortYear  $Year
@ShowMarks
@Star       *
@Text
@Time       $Time
@TwelveHour $Hour
@VShift
@Wide
@Year       $Year
@Yield

=pod

=head1 NAME

lout2html

=head1 DESCRIPTION

Converts a subset of Lout to HTML -- it does a very basic job which needs hand
correction - only for the desperate!

=head1 BUGS

If identifiers don't start with @ they will not be detected.

Does not cope with nested braces properly.

Ignores definitions and macros.

Its slow.

It doesn't cope with all the symbols.

It does not do tables, diagrams, graphics, etc - and may never do them - see below.

B<It is I<not> a true parser - so it will I<never> get it right.> There is no
BNF for Lout which means that making a proper parser is `non-trivial'. However,
C<lout2html> may be sufficient for your needs - at least to do the bulk of the 
conversion work.


=head1 README

Converts a subset of Lout to HTML -- it does a very basic job which needs hand
correction - only for the desperate!

=head1 PREREQUISITES

C<strict> 

C<integer>

C<Getopt::Long>
C<Lout>
C<HTML::Entities>

=head1 COREQUISITES

=head1 COPYRIGHT

Copyright (c) Mark Summerfield 1999. All Rights Reserved.
May be used/distributed under the GPL.
Email <summer@chest.ac.uk> with 'lout2html' in the subject line.

=head1 OSNAMES

any

=head1 SCRIPT CATEGORIES

Lout
Text-processing

=cut

