#! /bin/sh
# Gateways articles from mailing lists to netnews.  Feed the mail into
# this, and send the output into relaynews or newsspool for C News,
# presumably rnews for B news.
#
# NB! Later versions of this file will be in C News.
#
# We ensure that RFC1036 headers exist else we add them.
# (Path, Message-ID, From, Date, Newsgroups, Subject)
# Mask To, Cc, Newsgroups, Resent- with Original- to avoid potential
# infinite loops.  Perhaps we should delete them?
# Throw away Received, Bcc, Fcc, In-Reply-To, Illegal-*, X-Mailer
# Add our own Path, Sender, Approved
# Correct headers of the form Text:\n by adding space after :
# Rewrite From: into address (comments) instead of phrase <route>
# and qualify it. (!! should do a better job)
#
# Mark Moraes, University of Toronto

### Configuration variables

host=cs.toronto.edu		# ends up in Path:, and will therefore not be
				# sent the article.  Make sure it's not the
				# name of a news host.

fakeuser=nevdull		# user name that follows $host in Path:.
				# Should preferably be an alias for /dev/null

admin=list-admin		# local part of the Sender: address (also the
				# From: of last resort).  Should point at the
				# person that runs the gateway.

### End of configuration.  You shouldn't need to change stuff after this.

# Exactly one argument is accepted: the newsgroup(s) to post to.
if [ $# -ne 1 ]; then
	echo "Usage: $0 newsgroup" >&2
	exit 1
fi
newsgroups=$1
shift

# just in case system() runs $SHELL, not sh.
SHELL=/bin/sh
export SHELL

# Values spliced into the awk program further down.
path=$host!$fakeuser
sender=$admin@$host
approved=$newsgroups@mail.$host
# Debugging tap: keep a copy of the incoming mail in /tmp/m/save<pid> while
# passing it on to the awk filter.
# NOTE(review): the file name is predictable and tee does not create the
# /tmp/m directory -- confirm this tap is still wanted in production.
tee /tmp/m/save$$ |
nawk 'function extract(s, startchar, endchar, lpos, rpos, rest) {
	# s, startchar, endchar are arguments; lpos, rpos, rest are locals.
	# Returns the part of s that follows the first startchar, cut off
	# just before the next endchar (neither delimiter is included).
	# With no startchar in s the result is empty; with no endchar after
	# it, everything following startchar is returned.
	lpos = index(s, startchar);
	if (lpos == 0)
		return "";
	rest = substr(s, lpos + 1);
	rpos = index(rest, endchar);
	return (rpos == 0) ? rest : substr(rest, 1, rpos - 1);
}
function strip(s, out) {
	# s is the argument, out is a local.
	# Returns s with any run of blanks or tabs removed from its
	# beginning and from its end; interior whitespace is untouched.
	out = s;
	sub(/^[ \t]+/, "", out);
	sub(/[ \t]+$/, "", out);
	return out;
}
function upto(s, endchar, cut, head) {
	# s, endchar are arguments; cut, head are locals.
	# Returns the part of s that precedes the first endchar, or the
	# empty string when s has no endchar.  Whitespace is NOT removed
	# here; callers that want it gone pass the result through strip().
	head = "";
	cut = index(s, endchar);
	if (cut > 0)
		head = substr(s, 1, cut - 1);
	return head;
}
function addrmunge(s, tmp1, tmp2, addr, phrase) {
	# s is the argument (the text of an address header after the colon);
	# the rest are locals.  tmp1 and tmp2 are unused and remain only so
	# the parameter list keeps its old shape.  phrase is now declared
	# here so that it no longer leaks out as a global.
	#
	# Tries to rewrite an address from the RFC822 "phrase <address>" form
	# to RFC1036 "address (phrase)" form and returns the result.  If the
	# address has no @ or !, assume it is local and add @$host to it.
	# Yuck!
	# We do not pretend to handle pathological (in the psychiatric sense)
	# cases with imbalanced or nested parenthesis, angle brackets
	# or quoting. Why doesnt awk sub() have \(\)?
	if (s ~ /.*[ \t]*<.*>/) {
		# phrase <address>
		addr = extract(s, "<", ">");
		phrase = upto(s, "<");
	} else if (s ~ /.*\(.*\)/) {
		# address (phrase)
		phrase = extract(s, "(", ")");
		addr = upto(s, "(");
	} else {
		# bare address
		addr = s;
		phrase = "";
	}
	addr = strip(addr);
	phrase = strip(phrase);
	if (addr !~ /.*@.*/ && addr !~ /.*!.*/)
		addr = addr "@'"$host"'";
	# Only append the comment when there is something to put in it.
	if (phrase ~ /[^ \t]+/)
		addr = sprintf("%s (%s)", addr, phrase);
	return addr;
}
BEGIN {
	# Both state flags start out empty: inbody turns to "y" once the
	# header has ended, ignore turns to "y" while a dropped header
	# (and its continuation lines) is being skipped.
	ignore = inbody = "";
	# Path: and Newsgroups: are always ours and always come first.
	print "Path: '"$path"'";
	print "Newsgroups: '"$newsgroups"'";
}
inbody == "y" {
	# Once past the header every line is copied through unchanged.
	print $0;
	next;
}
inbody == "" && $0 ~ /^$/ {
	# The first empty line ends the header.  Before passing it on, make
	# up whichever of From:, Date:, Message-ID: and Subject: the message
	# did not supply, then add our own Sender: and Approved: lines.
	inbody = "y";
	if (saw["from"] == "") {
		# No From: header.  Fall back on Reply-To:, then on the
		# address from the envelope "From " line, then on
		# Return-Path:, and as a last resort on the gateway
		# administrator.
		if (replyto != "")
			addr = replyto;
		else if (from != "")
			addr = from;
		else if (rpath != "")
			addr = rpath;
		else
			addr = "'"$sender"'";
		print "From:", addrmunge(addr);
	}

	if (saw["date"] == "") {
		# stolen from C News anne.jones
		# Reorders the words that date(1) prints; the command writes
		# the rest of the Date: line, newline included.
		gendate = "set `date`; echo $1, $3 $2 $6 $4 $5";
		printf "Date: ";
		system(gendate);
	}
	if (saw["message-id"] == "") {
		# stolen from C News anne.jones
		# The angle brackets sit inside shell double quotes and so
		# need no backslash.  A backslash before them is not a
		# defined awk string escape: an awk that keeps it hands it
		# on to the shell, and the backslashes then show up in the
		# generated Message-ID.  $$ here is the pid of the shell
		# that system() starts.
		genmsgid = "set `date`; echo \"<$6$2$3.` echo $4 | tr -d : `.$$@'"$host"'>\"";
		printf "Message-ID: ";
		system(genmsgid);
	}
	if (saw["subject"] == "")
		print "Subject: (none)";
	print "Sender: '"$sender"'";
	print "Approved: '"$approved"'";
	# Finally the empty line itself, separating header from body.
	print;
	next;
}
/^[ \t]/ {
	# A header continuation line (it starts with a blank or a tab).
	# It shares the fate of the header it belongs to: swallowed when
	# that header was dropped, copied through otherwise.
	if (ignore != "y")
		print;
	next;
}
/^[^ \t]*:/ {
	# A header line: a name free of blanks and tabs, then a colon.
	# Depending on the name the header is dropped, masked with an
	# Original- prefix, rewritten, or passed through; headers needed
	# later by the end-of-header rule are remembered on the way.
	s = $0;
	ignore = "";
	if (s !~ /^[^ \t]*: /) # C news insists on a space after :
		sub(":", ": ", s);
	# Headers that are thrown away, continuation lines included.
	if (s ~ /^[xX]-[mM][aA][iI][lL][eE][rR]:/ || \
	    s ~ /^[iI][lL][lL][eE][gG][aA][lL]-.*:/ || \
	    s ~ /^[rR][eE][cC][eE][iI][vV][eE][dD]:/ || \
	    s ~ /^[bB][cC][cC]:/ || s ~ /^[fF][cC][cC]:/ || \
	    s ~ /^[iI][nN]-[rR][eE][pP][lL][yY]-[tT][oO]:/) {
		ignore = "y";
		next;
	}
	# Headers that are kept but masked with Original-.  Every pattern
	# is anchored at the start of the line so that only the header name
	# is tested.  Left unanchored, the To: and Cc: tests also matched
	# Reply-To: (which then never reached the Reply-To: code below) and
	# any header whose text merely contained "to:" or "cc:".
	if (s ~ /^[pP][aA][tT][hH]:/ || \
	    s ~ /^[aA][pP][pP][rR][oO][vV][eE][dD]:/ || \
	    s ~ /^[sS][eE][nN][dD][eE][rR]:/ || \
	    s ~ /^[nN][eE][wW][sS][gG][rR][oO][uU][pP][sS]:/ || \
	    s ~ /^[tT][oO]:/ || s ~ /^[cC][cC]:/ || \
	    s ~ /^[rR][eE][sS][eE][nN][tT]-.*:/) {
		printf("Original-%s\n", s);
		next;
	}
	# From: is rewritten into the "address (phrase)" form.
	if (s ~ /^[fF][rR][oO][mM]:/) {
		saw["from"] = s;
		sub(/^[fF][rR][oO][mM]:/, "", s);
		print "From:", addrmunge(s);
		next;
	}
	# The remaining known headers only get the spelling of their name
	# normalized and/or their value remembered; all fall through to
	# the print at the bottom.
	if (s ~ /^[dD][aA][tT][eE]:/) {
		saw["date"] = s;
		sub(/^[dD][aA][tT][eE]:/, "Date:", s);
#		s = checkdate(s);
	}
	if (s ~ /^[mM][eE][sS][sS][aA][gG][eE]-[iI][dD]:/) {
		saw["message-id"] = s;
		sub(/^[mM][eE][sS][sS][aA][gG][eE]-[iI][dD]:/,"Message-ID:",s);
#		s = checkmsgid(s);
	}
	if (s ~ /^[rR][eE][pP][lL][yY]-[tT][oO]:/) {
		# Kept as a candidate for a made-up From: line.
		replyto = s;
		sub(/^[rR][eE][pP][lL][yY]-[tT][oO]:/, "", replyto);
	}
	if (s ~ /^[rR][eE][tT][uU][rR][nN]-[pP][aA][tT][hH]:/) {
		# Likewise a candidate for a made-up From: line.
		rpath = s;
		sub(/^[rR][eE][tT][uU][rR][nN]-[pP][aA][tT][hH]:/, "", rpath);
	}
	if (s ~ /^[sS][uU][bB][jJ][eE][cC][tT]:/) {
		saw["subject"] = s;
		sub(/^[sS][uU][bB][jJ][eE][cC][tT]:/, "Subject:", s);
	}
	print s;
	next;
}
$0 ~ /^From / {
	# Envelope "From " line: remember the second field as a candidate
	# for a made-up From: header and print nothing.
	from = $2;
	next;
}
{
	# Anything else seen before the end of the header is reported in
	# the output stream itself.
	print "non header line in header: " $0; # | "cat >&2"
}
END {
	# Input ran out while still in the header: may as well stick a
	# blank line on so the header is properly terminated.
	if (inbody != "y")
		printf "\n";
}' | tee /tmp/m/out$$
