#!/opt/local/bin/gawk -f
#
# bbl2html.awk v1.2c
#
#	Released to the public domain (ie. use at your own risk)
#	Rik Blok <rikblok@mail.com>
#	December 13, 2000.
#
#	Latest version available from 
#			http://rikblok.cjb.net/scripts/bbl2html.awk
#
#	Converts a LaTeX .bbl file to (mostly) formatted html code.  Probably
#	also works if applied directly to a .tex file.  Sets bookmarks 
#	to the keys so you can reference a citation from another page, eg. 
#	<a href="bib.html#key">[1]</a> will make a link to "key".
#
#	bbl2html will use the default label unless you set (on the command-line)
#	override=key or override=number in which case it will use the citation 
#	key or numeric format, respectively.
#
#	I wrote this script out of dissatisfaction with other conversion tools
#	available.  Hopefully it will be of use to somebody.  Feel free to 
#	modify the script to suit you.  I've tested it with the bibliography
#	styles abbrv, alpha, apalike, ieeetr, plain, prsty, siam, and unsrt 
#	and it works fairly well.
#	For a sample of the output visit http://rikblok.cjb.net/lib/refs.html.
#
#	Usage:
#		awk -f bbl2html.awk head=<header> foot=<footer> \
#			[override=key|number] [labelwidth=<width>] \
#			[bigtable=0|1] [noabout=0|1] <infile> > <outfile>
#	
#	where
#		<header> and <footer> may be formatted text (enclosed in escaped
#	quotes if containing a space) to be placed at the beginning and end
#	of the output, respectively.  As a special case, if either begins with
#	the symbol "@" it is assumed to be a filename and the text is read from
#	the file specified.  If neither a header nor a footer is specified
#	<html><body> and </body></html> are used, respectively;
#		override is an optional variable to change the displayed label to
#	the citation key or numeric format;
#		<width> is the width of the column label, in pixels or, if appended
#		by "%", in percent (optional, defaults to 50 (pixels));
#		bigtable is an optional variable which allows the page to be
#	formatted as one big table (=1) or a separate table for each entry (=0)
#	(multiple tables can be displayed incrementally as the page loads but 
#	a single table will be rendered faster.  Optional, defaults to 0);
#		noabout is an optional variable which tells bbl2html.awk not to 
#	print the "Generated by bbl2html.awk..." comment at the bottom of the
#	page (optional, defaults to 0);
#		<infile> is the name of the bibliography file (or LaTeX file?) to 
#	be converted; and
#		<outfile> is the name of the html file to be generated.
#
#	Sample usages:
#		awk -f bbl2html.awk head=\<html\>\<body\> foot=\</body\>\</html\> bib.bbl > bib.html
#		awk -f bbl2html.awk head=@bib.head foot=@bib.foot override=key bib.bbl > bib.html
#		awk -f bbl2html.awk -f myOwnSubstitutions.awk bib.bbl > bib.html
#		awk -f bbl2html.awk labelwidth=20% bigtable=1 bib.bbl > bib.html
#
#	Notes: 
#	
#	1) You can add your own substitutions fairly easily by setting
#		userfind[] and userreplace[] in the BEGIN action.  You can also
#		place the substitutions in a separate awk file (within a BEGIN
#		action) to avoid modifying this script (see the last example above
#		for a sample usage).
#
#	2) To generate a complete list of citations from a bibtex file 
#		myreferences.bib use the bbl file generated by this latex file:
#
#		%%%% begin latex file
#		\documentclass{article}
#		\usepackage{url}	% if your citations have any \url{} commands
#		\begin{document}
#		\nocite{*}
#		\bibliographystyle{unsrt}	% use whichever style you prefer
#		\bibliography{myreferences}	% use myreferences.bib
#		\end{document}
#		%%%% end latex file
#
#	3) bbl2html.awk needs GNU awk/gawk.  On Solaris machines, neither
#		/usr/bin/awk nor /usr/bin/nawk work.
#
#	4) If you download this file make sure it has the proper line-endings
#		for your filesystem.  Otherwise running the script will probably
#		generate an "^ Invalid char" error.  On Unix, process the script
#		with dos2unix if you encounter this error.
#
#	5) Any occurrences of "<" and ">" must be escaped in header
#		and footer.  Eg. head=\<html\>\<body\> foot=\</body\>\</html\>
#		(not needed in files header/footer may point to).
#
#	Revisions:
#	v1.2c	December 13, 2000
#		- added Unix shell header (!/opt/local/bin/gawk -f) to run script
#		as a shell command.  The path may need to be modified on your 
#		machine.
#		- labelwidth now defaults to pixels.  Append a percent symbol to
#		use as percent (eg. "labelwidth=20%").
#		- more linebreaks allowed in urls
#		- fixed: end-of-line comments ("%\n") left in txt of \href{url}{txt}
#		- adds "Generated by bbl2html.awk..." comment at bottom of file
#		(can be disabled with "noabout=1" command-line parameter)
#	v1.2b	December 11, 2000
#		- fixed: chokes on \href{url}%\n{text}
#		- basic math support (italics, super- and sub-scripts)
#		- now has bookmarks (<a name="...">) for both keys and labels 
#		  (if different)
#	v1.2	December 7, 2000
#		- basic support for \href{url}{txt} (tries to guess how to format 
#		  'txt', either as text or as an url)
#		- if neither header nor footer specified, defaults to
#		  head="<html><body>" and foot="</body></html>"
#	v1.1	November 5, 2000
#		- added labelwidth option (percentage)
#		- rudimentary support for smallcaps
#		- added notes 4 and 5 
#	v1.0c	August 25, 2000
#		- replaced userfind[]/userreplace[] indices with descriptive keys
#	v1.0b	July 31, 2000
#		- replaceFormat() checks for multiple occurrences of formatting
#		- now handles more general keys
#		- supports most bibliography styles (that I know of)
#		- defaults to using whatever labels are supplied, or numeric (if none)
#		- can override label with override=key or override=number
#	v1.0	July 28, 2000
#		- initial release
#
#	To do:
#	- nothing urgent
#
#	Thanks to:
#	- Marc Mutz for bug-hunting and the math substitutions
#--------------------------------------------------------------------------
BEGIN {
	# bbl2html.awk information (echoed in the "Generated by ..." line that
	# the END rule prints)
	version = "1.2c";		
	home = "http://rikblok.cjb.net/scripts/index.html#bbl2html.awk";

	# put user-defined substitutions here
	# (each userfind[key] is a regexp handed to gensub() by authorFormat(),
	# with userreplace[key] as the matching replacement text)
	# arXiv.org preprint archive
	userfind["arxiv"]    = "arXiv:([a-zA-Z\.\-]+\/[0-9][0-9][0-9][0-9][0-9][0-9][0-9])";
	userreplace["arxiv"] = "<a href=\"http://arXiv.org/abs/\\1\"><tt>arXiv:\\1</tt></a>";
	# siam style uses a horizontal line in place of repeating author names
	userfind["siam1"]    = "\\\\leavevmode\\\\vrule height 2pt depth -1.6pt width 23pt";
	userreplace["siam1"] = "<strike>\\&nbsp;\\&nbsp;\\&nbsp;\\&nbsp;\\&nbsp;\\&nbsp;\\&nbsp;\\&nbsp;</strike>";
	# some trivial math
	userfind["math"]    = "([^\\\\])\\$([^\\$]*)\\$";
	userreplace["math"] = "\\1<var>\\2</var>";
	# for super and subscripts:
	userfind["math.sub"]    = "([^\\\\])(\\$|<var>)(.*)_([a-zA-Z0-9]|{.*}|\\\\[a-zA-Z]+)([^\\$]*)(\\$|</var>)";
	userreplace["math.sub"] = "\\1<var>\\3<sub>\\4</sub>\\5</var>";
	userfind["math.sup"]    = "([^\\\\])(\\$|<var>)(.*)\\^([a-zA-Z0-9]|{.*}|\\\\[a-zA-Z]+)([^\\$]*)(\\$|</var>)";
	userreplace["math.sup"] = "\\1<var>\\3<sup>\\4</sup>\\5</var>";

	# Strip out everything up to and including the line that contains
	# "\begin{thebibliography}".  The return value of getline is checked so
	# that an input which never contains that line stops the scan at
	# end-of-file (or on a read error) instead of looping forever.
	while ((getline line) > 0)
		if (line ~ /\\begin{thebibliography}/)
			break;
	if (line !~ /\\begin{thebibliography}/)
		print "bbl2html.awk: warning: no \\begin{thebibliography} found in input" > "/dev/stderr";
		
	# the getline calls above advanced NR; reset it so the "NR == 1" rule
	# fires on the first record read with the separators set below
	NR=0;

	RS = "\\\\bibitem";			# record separator = "\bibitem"
	FS = "\\\\newblock[ \n]+";	# field separator = "\newblock "
}

NR == 1 {		# first record: command-line assignments have been applied by now
	# with neither a header nor a footer supplied, wrap the output in a
	# minimal html skeleton
	if (!(head || foot)) {
		foot = "</body></html>";
		head = "<html><body>";
	}

	# emit the header (literal text or "@file" contents)
	printhf(head);

	# label column width falls back to 50 (pixels)
	if (!labelwidth)
		labelwidth = 50;

	# in single-table mode the table is opened once, here
	if (bigtable)
		print "<table width=\"100%\">";
}

# runs for every record: forget the label carried over from the previous entry
{ label = ""; }

# record starts with "[": an explicit label "[...]" precedes the key.
# Store it (formatted) in 'label' and remove it from the front of $1.
$1 ~ /^\[/ {					
	# position of the "]" closing the leading "["
	right = matchBrace($1,1);

	# text between the brackets, with the alpha-style "{\etalchar{+}}"
	# (first occurrence) turned into "+", then run through authorFormat()
	# (needed for apalike-style labels)
	label = authorFormat(gensub(/{\\etalchar{\+}}/, "+", 1, substr($1,2,right-2)));

	# keep only what follows the closing bracket
	$1 = substr($1,right+1);
}

# leading "{": $1 now starts with the citation key "{key}".  Emits one table
# row for the entry: the label cell (with bookmarks for key and label) and a
# cell holding every \newblock field, formatted, one per output line.
$1 ~ /^{/ {
	keycount++;
	# if label not already set or override then set to number
	if (!label || override=="number")	label = keycount;
	
	# get length of key from $1
	right = matchBrace($1,1);	

	# set bookmark to key
	key = substr($1,2,right-2);
	
	# use key as label?
	if (override=="key")	label = key;

	if (!bigtable) print "<table width=\"100%\">";
	print "<tr><td width=\"" labelwidth "\" valign=\"top\">";
	# key and label are data, so they are never used as a printf format
	# string; the anchors are closed so they cannot leak into later markup
	print "<a name=\"" key "\">[" label "]</a>";
	if (key != label)	printf("%s", "<a name=\"" label "\"></a>");
	# fixing width=100% looks better when using multiple tables
	printf("%s", "</td><td" (bigtable ? "" : " width=\"100%\"") ">");

	# strip key out of first line
	line = substr($1,right+1);

	# process each line (field)
	lineno=1;
	while (lineno<=NF) {
		# if last line then check for "\end{thebibliography}"
		if (lineno==NF)	sub(/\n\\end{thebibliography}/,"",line);
		
		# first take out any urls before any more processing; each one is
		# replaced by a placeholder "__URL<n>__" and stored in url[n]
		while ((left=match(line,/\\url{/))>0) {
			right = matchBrace(line,left+4);
			# unmatched brace: leave the text alone and stop, otherwise
			# the same "\url{" would be found again forever
			if (right<=left)	break;
			urlcnt++;
			url[urlcnt] = substr(line,left+5,right-left-5);
			# indices are reused from entry to entry; clear any display
			# text left behind by an earlier \href with the same index
			txt[urlcnt] = "";
			line = substr(line,1,left-1) "__URL" urlcnt "__" substr(line,right+1);
		}
		# repeat for hrefs: \href{url}{txt}
		while ((left=match(line,/\\href{/))>0) {
			right = matchBrace(line,left+5);
			if (right<=left)	break;		# unmatched brace, see above
			urlcnt++;
			url[urlcnt] = substr(line,left+6,right-left-6);
			txt[urlcnt] = "";
			# now find href text, starting with next '{'
			if ((left2 = match(substr(line,right+1),/{/))>0) {
				left2 += right;
				right2 = matchBrace(line,left2);
				# only consume the text group when its brace is matched
				if (right2>left2) {
					txt[urlcnt] = substr(line,left2+1,right2-left2-1);
					right = right2;
				}
			}
			# replace 
			line = substr(line,1,left-1) "__URL" urlcnt "__" substr(line,right+1);
		}
		
		line = authorFormat(line);
		
		# re-insert formatted urls (highest index first, counting urlcnt
		# back down to zero for the next line)
		while (urlcnt) {
			url[urlcnt] = urlFormat(url[urlcnt],txt[urlcnt]);
			urlmark = "__URL" urlcnt "__";
			# can't use sub() because url[urlcnt] may contain "&"
			if (match(line,urlmark)) {
				line = substr(line,1,RSTART-1) url[urlcnt] substr(line,RSTART+RLENGTH);
			}
			urlcnt--;
		}
		print line "<br>";
		
		# get ready for next line
		lineno++;
		line = $lineno;
	}
	
	printf("</td></tr>");
	if (!bigtable)	print "</table>";
}

END {
	# single-table mode: the one table opened at the start is closed here
	if (bigtable)
		print "</table>";

	# credit line, unless suppressed with noabout=1
	if (!noabout)
		printf("<hr><font size=\"-1\"><address>Generated by <a href=\"%s\">bbl2html.awk</a> v%s</address></font>%s", home, version, ORS);

	# footer (literal text or "@file" contents), if there is one
	if (foot)
		printhf(foot);
}
#--------------------------------------------------------------------------
function authorFormat(	s,		# function parameters
						i)		# local variables
#	Substitute accents and formatting in author-type string s and return
#	the result as html.  Applies, in order: small-caps conversion, accent
#	and punctuation substitutions, the user-defined userfind[]/userreplace[]
#	substitutions, the \em, \bf, \it and \tt styles, and finally removes
#	any braces and backslashes that are left.
#	The loop index i is declared local so that calling this function does
#	not overwrite a global variable of the same name.
{
	# first replace small-caps formatting style so accents are handled properly
	s = replaceFormatSC(s);

	# next, replace accents
	s = gensub(/\\'([AEIOUYaeiouy])/, "\\&\\1acute;", "g", s);
	s = gensub(/\\`([AEIOUaeiou])/, "\\&\\1grave;", "g", s);
	s = gensub(/\\\^([AEIOUaeiou])/, "\\&\\1circ;", "g", s);
	s = gensub(/\\~([AEINOUaeinou])/, "\\&\\1tilde;", "g", s);
	s = gensub(/\\[\.]([AEIOUaeiou])/, "\\&\\1ring;", "g", s);
	s = gensub(/\\\"([AEIOUaeiou])/, "\\&\\1uml;", "g", s);
	# NOTE(review): this pattern has no end-of-command check, so it also
	# matches the leading "\o"/"\O" of any longer command name
	s = gensub(/\\([Oo])/, "\\&\\1slash;", "g", s);
	s = gensub(/\\(AE|ae)/, "\\&\\1lig;", "g", s);
	gsub(/\\ss/,"\\&szlig;",s);	# German sharp s
	gsub(/~/,"\\&nbsp;",s);		# replace nonbreaking spaces: ~ --> &nbsp;
	gsub(/\\[,@]/," ",s);		# replace spaces
	gsub(/``|''/,"\\&quot;",s);	# replace quotes
	gsub(/---/,"-",s);			# replace dashes
	gsub(/--/,"-",s);
	
	# these accents can't be displayed in HTML (with my charset) so delete 'em
	s = gensub(/\\[bcduvH]{([a-zA-Z])}/, "\\1", "g", s);	# eg. \u{o}
	gsub(/{\\AA}/,"A",s);	# \AA --> A
	gsub(/{\\aa}/,"a",s);	# \aa --> a
	
	# user-defined substitutions (applied in the order "for ... in" yields
	# the keys, which awk leaves unspecified)
	for (i in userfind) {
		s = gensub(userfind[i], userreplace[i], "g", s);
	}
	
	# replace formatting styles
	s = replaceFormat(s,"\\em","<em>","</em>");	# replace emphasis
	s = replaceFormat(s,"\\bf","<b>","</b>");	# replace bold
	s = replaceFormat(s,"\\it","<i>","</i>");	# replace italics
	s = replaceFormat(s,"\\tt","<tt>","</tt>");	# replace teletype
	
	gsub(/{|}/,"",s);			# drop any remaining braces
	gsub(/\\/,"",s);			# drop any remaining slashes
	return s;
}
#--------------------------------------------------------------------------
function urlFormat(	url,		# function parameters
					display)	# optional variables
# Build the html link for 'url'.  'display' is the optional link text: when
# it is missing, or itself looks like an url ("://" anywhere, or a leading
# "mailto:", case-insensitively), the text is shown in teletype with break
# opportunities; otherwise it is formatted like author text.
{
	# clean up the target: no spaces, no "%\n" comments, no linebreaks
	gsub(/ /,"",url);
	gsub(/%\n/,"",url);
	gsub(/\n/,"",url);
	# the link text only loses its end-of-line comments
	gsub(/%\n/,"",display);

	# ordinary text: hand it to authorFormat()
	if (display && tolower(display) !~ /:\/\/|^mailto:/)
		return "<a href=\"" url "\">" authorFormat(display) "</a>";

	# url-like (or absent) text: fall back to the url itself when absent and
	# allow a linebreak after every character that is not a letter, digit
	# or space
	if (!display)
		display = url;
	display = "<tt>" gensub(/([^A-Za-z0-9 ])/,"\\1<wbr>","g",display) "</tt>";

	return "<a href=\"" url "\">" display "</a>";
}
#--------------------------------------------------------------------------
function printhf(	s,				# function parameters
					line, status)	# local variables
#	Prints s (head or foot).  If s starts with "@" then the rest of s is
#	taken to be a filename and the contents of that file are printed
#	instead.  If the file cannot be read a warning is written to
#	/dev/stderr and nothing is printed for it.
{
	# if no leading "@" then just print s
	if (s !~ /^@/) {
		print s;
		return;
	}
	
	# else print contents of file s
	s = substr(s,2);	# drop leading "@"
	# getline returns 1 per record read, 0 at end-of-file, -1 on error
	while ((status = (getline line < s)) > 0)
		print line;
	if (status < 0)
		print "bbl2html.awk: warning: cannot read file \"" s "\"" > "/dev/stderr";
	close(s);
}
#--------------------------------------------------------------------------
function replaceFormat(	s,find,replaceleft,replaceright,	# function parameters
						lpos, rpos)							# local variables
#	Swap a LaTeX style switch for a pair of html tags, for every occurrence
#	of 'find' in s.  For example
#		s = replaceFormat(s,"\\it","<i>","</i>");
#	turns {\it et al.} into {<i> et al.</i>}.
{
	for (lpos = index(s,find); lpos > 0; lpos = index(s,find)) {
		# the group is assumed to have been opened by a "{" just before
		# 'find'; look for the "}" that closes it
		rpos = matchBrace(s,lpos,"{");
		# closing tag goes in first (it sits further right, so lpos stays valid)
		if (rpos > lpos)
			s = substr(s,1,rpos-1) replaceright substr(s,rpos);
		# then the switch itself is replaced by the opening tag
		s = substr(s,1,lpos-1) replaceleft substr(s,lpos+length(find));
	}
	return s;
}
#--------------------------------------------------------------------------
function replaceFormatSC(	s,								# function parameters
							find,replaceleft,replaceright,	# local variables
							left, right,l,c,r,cout,i,capslock,ch,n)
#	Convert small caps "{\sc ...}" groups: every lowercase letter is
#	uppercased, and each run that begins with a lowercase letter is wrapped
#	in a smaller font.
#	Call this before accents are replaced, so that "\'a" becomes "\'A"
#	rather than "&aacute;" becoming "&AACUTE;".
#	Not robust: only a small subset of LaTeX commands (such as accents) is
#	expected inside the group; anything else will probably come out in
#	uppercase (userfind[]/userreplace[] substitutions can probably repair
#	that for now).
{
	find = "\\sc";
	replaceleft = "<font size=\"-1\">";
	replaceright= "</font>";
	for (left = index(s,find); left > 0; left = index(s,find)) {
		# "}" closing the "{" assumed to precede 'find'; without one the
		# group runs to the end of the string
		right = matchBrace(s,left,"{");
		if (right<left)	right = length(s);

		# cut s into: text before, group content (after 'find', up to and
		# including position 'right'), text after
		l = substr(s,1,left-1);
		c = substr(s,left+length(find),right-left-length(find)+1);
		r = substr(s,right+1);

		# walk the content one character at a time; capslock==1 means the
		# small font is currently closed
		capslock = 1;
		cout = "";
		n = length(c);
		for (i=1; i<=n; i++) {
			ch = substr(c,i,1);
			if (ch ~ /[a-z]/) {
				ch = toupper(ch);
				if (capslock) {
					# first lowercase letter of a run opens the small font
					ch = replaceleft ch;
					capslock = 0;
				}
			} else if (ch ~ /[A-Z0-9]/ && !capslock) {
				# an uppercase letter or digit ends the run
				ch = replaceright ch;
				capslock = 1;
			}
			cout = cout ch;
		}
		# close a run still open at the end of the group
		if (!capslock) cout = cout replaceright;

		# correct mangled accents
		gsub(/\\B{/, "\\b{", cout);
		gsub(/\\C{/, "\\c{", cout);
		gsub(/\\D{/, "\\d{", cout);
		gsub(/\\U{/, "\\u{", cout);
		gsub(/\\V{/, "\\v{", cout);

		# put the pieces back together
		s = l cout r;
	}
	return s;
}
#@include matchBrace.awk
#--------------------------------------------------------------------------
# matchBrace.awk - library containing single function matchBrace()

function matchBrace(	s,i,						# function parameters
						open,						# optional parameters
						depth,left,right,pos,n,ch)	# local variables
#	Finds the brace matching the one at index i of string s.  If 'open' is
#	given, that character is taken as the brace type instead of reading it
#	from position i (position i itself is then not examined).
#	Recognized pairs are () [] {} <> and `' ; any other character is
#	paired with itself.  Nested pairs are skipped.
#	Returns the index (in s) of the matching brace, or zero if i is zero or
#	no match is found.
#	The search compares characters directly rather than through a regexp,
#	so "|" and regexp metacharacters in s or 'open' are not miscounted as
#	braces.
{
	# error trap
	if (!i)	return 0;
	
	# if open not specified then read from substr(s,i,1)
	if (!open)	open = substr(s,i,1);
	
	# identify type of braces and put in left and right
	left = "([{<`)]}>'";
	right= ")]}>'([{<`";
	pos = index(left,open);
	if (!pos) {						# not in list of braces
		left = open;				# set left and right to the same thing
		right= open;
	} else {
		left = substr(left, pos,1);	# found in list, set match
		right= substr(right,pos,1);
	}

	# scan forward from the character after i, tracking nesting depth
	n = length(s);
	depth = 1;
	for (pos = i+1; pos <= n; pos++) {
		ch = substr(s,pos,1);
		if (ch == right) {			# tested first so that left==right closes
			if (--depth == 0)	return pos;
		} else if (ch == left)
			depth++;
	}
	return 0;						# ran out of string, no match
}
#--------------------------------------------------------------------------
