/* html.c - simple HTML parsing	*/
/* by Reinier Post		*/

/*
 * $Log: html.c,v $
 * Revision 0.14  1994/05/17  13:40:14  reinpost
 * cache_prefix is now obtained from cleanup.c as it may depend on the URL
 * to be translated
 *
 * Revision 0.14  1994/05/17  13:40:14  reinpost
 * cache_prefix is now obtained from cleanup.c as it may depend on the URL
 * to be translated
 *
 * Revision 0.13  1994/05/03  19:31:38  reinpost
 * small change
 *
 * Revision 0.12  1994/04/01  15:07:40  reinpost
 * the code to translate an URL was separated out
 *
 * Revision 0.10  1994/03/11  13:40:18  reinpost
 * absolute prefix is now handled in html.c instead of cache.c
 * a fix for #name types of URLs made this necessary but it's better anyway
 *
 * Revision 0.10  1994/03/11  13:40:18  reinpost
 * absolute prefix is now handled in html.c instead of cache.c
 * a fix for #name types of URLs made this necessary but it's better anyway
 *
 * Revision 0.8  1994/02/25  20:27:04  reinpost
 * one message was added where really we should die
 *
 * Revision 0.8  1994/02/25  20:27:04  reinpost
 * one message was added where really we should die
 *
 * Revision 0.6  1994/02/17  21:25:21  reinpost
 * a horrible bug causing the script to crash on Solaris
 *
 * Revision 0.5  1994/02/17  10:23:45  reinpost
 * *** empty log message ***
 *
 * Revision 0.4  1994/02/10  12:50:29  reinpost
 * URLs of the form //host:port/path/file are now parsed that way
 *
 * Revision 0.3  1994/02/01  19:48:37  reinpost
 * URLs can now remain unescaped after translation
 *
 * Revision 0.2  1994/01/25  14:59:31  reinpost
 * FORM ACTION attribute values are now translated
 *
 * Revision 0.1  1994/01/20  12:11:47  reinpost
 * there may still be some bugs in rel_to_abs - in particular, it doesn't
 * necessarily stay within the memory limits of the output buffer,
 * and it may omit skipping whitespace at some points
 *
 */

/* RCS identification string for this file (name, revision, date, author) */
static char rcsid[] =
  "$Id: html.c,v 0.14 1994/05/17 13:40:14 reinpost Exp $";

/* some of the HTML parsing is already in util.c, by Rob McCool */
/* a modified version of this is included */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if !defined(SVR4) && !defined(SOLARIS2)
#include <strings.h>
#endif

#include "system.h"

#include "constants.h"
#include "util.h"
#include "log.h"  /* for log_if_debug */
#include "cleanup.h"  /* for conf_cache_prefix() */

/* the link scanner/converter */

/*
 * the states for the link scanning automaton (driven by rel_to_abs)
 *
 * OUTSIDE          not inside a tag; scanning forward for the next <
 * IN_TAG           just past a <; the tag name has not been examined yet
 * IN_WRONG_TAG     inside a tag that is not rewritten (uninteresting tag,
 *                  or a link tag without its link attribute); skip to >
 * IN_URL           just past the = of a link attribute; the URL follows
 * ON_RELATIVE_URL  a URL has been delimited and must be resolved against
 *                  the current document before being copied out
 * ON_ABSOLUTE_URL  a URL has been delimited and is copied out as given
 *                  (after the cache prefix)
 */

#define	OUTSIDE		1
#define	IN_TAG		2
#define	IN_WRONG_TAG	3
#define	IN_URL		4
#define ON_RELATIVE_URL 5
#define ON_ABSOLUTE_URL 6

/*
 * 'link attributes' - emphatically not: LINK attributes
 * for each tag that is rewritten, the name of the attribute holding the URL
 */

static char a_la[]      = "HREF";
static char img_la[]    = "SRC";
static char form_la[]   = "ACTION";
char *no_la = (char *)0;

static char *is_link_tag(char **s, char *tag)
/*
 * Tests whether the text at *s starts with the tag name `tag'.
 *
 * The text is taken to be the tag if it starts with `tag' (compared
 * case-insensitively) and the character right after that prefix is
 * non-alphanumeric, so "A" matches "A HREF=..." but neither "ADDRESS"
 * nor "A1".
 *
 * returns: the link attribute name for the tag, selected by the first
 *          letter of `tag' ('a' -> "HREF", 'i' -> "SRC", 'f' -> "ACTION");
 *          no_la (a null pointer) if *s is not the tag or the tag's first
 *          letter is none of these
 * side effect: on a match, *s is moved just beyond the tag name;
 *          otherwise *s is left alone
 * warning: this is a hack
 */
{
  size_t len = strlen(tag);

  /*
   * a successful prefix compare guarantees (*s)[len] is readable: it is at
   * worst the terminating null
   * the casts keep the <ctype.h> calls defined for chars >= 0x80 on
   * platforms where plain char is signed
   */
  if (strncasecmp(*s,tag,len) != 0 || isalnum((unsigned char)(*s)[len]))
  {
    return(no_la);
  }

  *s += len;
  switch (tolower((unsigned char)tag[0]))
  {
    case 'a': return(a_la);	/* "A"    */
    case 'i': return(img_la);	/* "IMG"  */
    case 'f': return(form_la);	/* "FORM" */
    default:  return(no_la);	/*  ???   */
  }
}

static int set_absolute_prefix(char *abs_prefix,char *url)
/*
 * Copies url into abs_prefix up to and including the rightmost slash and
 * null-terminates the result.
 *
 * abs_prefix must be large enough to hold the result (at most
 * strlen(url)+1 bytes); this is NOT checked here.
 *
 * returns 1 on success; returns 0, leaving abs_prefix untouched,
 * iff the URL contains no slash
 */
{
  char *last_slash = strrchr(url,'/');
  size_t prefix_len;

  if (last_slash == NULL)
  {
    /* URL does not contain a / */
    return(0);
  }

  /* abs_prefix is url up to and including the rightmost slash */
  prefix_len = (size_t)(last_slash - url) + 1;
  memcpy(abs_prefix,url,prefix_len);
  abs_prefix[prefix_len] = '\0';
  return(1);
}

static void do_translate_url(char *d, char *l, char *s, char *a_pref,
  int a_pref_len, int a_pref_path_offset, char *a_url,
      int state, int translate_escaped)
/*
 * Translates the URL starting at l, ending just before s, into the buffer
 * starting at d, and null-terminates the result.
 *
 * d                   output buffer; there is NO overflow check, the
 *                     caller must make it large enough
 * l, s                first char of the URL / first char beyond it
 * a_pref              absolute prefix of the current document, normally of
 *                     the form http://host:port/some/path/
 * a_pref_len          strlen(a_pref)
 * a_pref_path_offset  offset in a_pref at which a rooted URL (one that
 *                     starts with a single /) is to be placed
 * a_url               URL of the current document; used instead of a_pref
 *                     for URLs that start with ? or #
 * state               ON_RELATIVE_URL: resolve the URL against a_pref or
 *                     a_url; anything else: copy the URL as given
 * translate_escaped   if nonzero, everything after the cache prefix is
 *                     passed to escape_url_with_slashes()
 *
 * The output is: conf_cache_prefix(URL), then (relative URLs only) the
 * applicable part of a_pref or a_url, then the URL itself.
 * If no memory is available for the temporary copy of the URL, the URL is
 * written to d untranslated.
 */
{
  size_t url_len = (size_t)(s - l);
  char *url_copy;
  char *body;  /* start of the part that follows the cache prefix */
  char *out;   /* next position to write to */

  /* first prefix c_pref; conf_cache_prefix needs a null-terminated URL */
  url_copy = malloc(url_len + 1);
  if (url_copy == NULL)
  {
    log_if_debug("out of memory, URL left untranslated",a_url);
    memcpy(d,l,url_len);
    d[url_len] = '\0';
    return;
  }
  memcpy(url_copy,l,url_len);
  url_copy[url_len] = '\0';
  strcpy(d,conf_cache_prefix(url_copy));
  free(url_copy);

  /* DON'T add separating / or ?  --  they must be part of c_pref already */
  body = d + strlen(d);

  out = body;
  if (state == ON_RELATIVE_URL)
  {
    /*
     * two cases: it starts with ? or #, in which case the file name
     * is left implicit, and we must supply a_url, or it doesn't, in
     * which case the document name is implicitly given, and a_pref
     * is to be used
     */
    if (*l == '#' || *l == '?')
    {
      strcpy(out,a_url);
      out += strlen(a_url);
    }
    else
    {
      strcpy(out,a_pref);
      if (*l != '/')
      {
        /* plain relative URL: goes right after the whole prefix */
        out += a_pref_len;
      }
      else if (*(l+1) != '/')
      {
        /* rooted URL (begins with /): strip the path off the prefix */
        out += a_pref_path_offset;
      }
      else
      {
        /*
         * URL is of the form //host[:port]/bla/bla/bla
         * a leading // is _not_ identical in meaning to /: only the scheme
         * ("http:") of a_pref is kept and the URL is copied after it.
         * The scheme is located instead of assuming 5 characters, so a
         * short or empty a_pref cannot leave a gap of uninitialised bytes.
         */
        size_t scheme_len = strcspn(a_pref,":/");
        if (a_pref[scheme_len] == ':')
        {
          out += scheme_len + 1;
        }
      }
    }
  }

  /* append the original URL */
  memcpy(out,l,url_len);
  out[url_len] = '\0';

  /* and escape it, including a_pref (most importantly, the slashes) */
  if (translate_escaped) escape_url_with_slashes(body);

#if ST_JUTTEMIS
  {
    /* room for both strings plus the fixed text of the format */
    char *err = malloc(url_len + strlen(d) + sizeof("'' to ''"));
    if (err != NULL)
    {
      sprintf(err, "'%.*s' to '%s'",(int)url_len,l,d);
      log_if_debug("translating URL:",err);
      free(err);
    }
  }
#endif /* ST_JUTTEMIS */

}

void translate_url(char *d, char *s, char *bare_url,
  int translate_escaped)
/*
 * Translates the single, null-terminated URL s into the buffer d.
 *
 * d                  output buffer; no overflow check is made
 * s                  the URL to translate
 * bare_url           URL of the document s is relative to; its part up to
 *                    and including the last slash serves as the absolute
 *                    prefix
 * translate_escaped  passed on to do_translate_url
 *
 * s is treated as absolute iff it starts with "http://" (compared
 * case-insensitively); every other URL is resolved as a relative one.
 */
{
  char a_pref[HUGE_STRING_LEN+1];  /* should be more than enough */
  int a_pref_len;
  int a_pref_path_offset;

  /* set the extra variables - same computation as in rel_to_abs */
  if (strlen(bare_url) >= sizeof(a_pref))
  {
    /* the prefix might not fit in a_pref: do without one */
    a_pref[0] = '\0';
    log_if_debug("oops, absolute URL too long",bare_url);
  }
  else if (!set_absolute_prefix(a_pref,bare_url))
  {
    a_pref[0] = '\0';
    log_if_debug("oops, absolute URL doesn't contain a slash",bare_url);
  }
  a_pref_len = strlen(a_pref);

  if (strncasecmp(a_pref,"http://",7) || a_pref_len < 8)
  {
    /* it's not an absolute http: URL prefix at all */
    a_pref_path_offset = a_pref_len;
    log_if_debug("oops, no http: absolute prefix",a_pref);
  }
  else
  {
    /* put it on the / that ends http://host[:port] (or on the \0) */
    /* this is because rooted URLs bring their own / */
    char *p = &a_pref[7];
    while (*p && *p != '/') ++p;
    a_pref_path_offset = p - a_pref;
  }

  do_translate_url(d, s, &s[strlen(s)],a_pref,a_pref_len,
    a_pref_path_offset, bare_url,
      strncasecmp(s,"http://",7) ? ON_RELATIVE_URL : ON_ABSOLUTE_URL,
        translate_escaped);
}

int rel_to_abs
  (char **to_dest, char **to_src, char *a_url,
   int dest_len, int translate_escaped)
/*
 * copy src to dest, turning all relative links into absolute http-links
 * relative to a_url, which is this document's url without ? and # suffixes
 * prefixes of the form http: are omitted if not followed by a
 * host spec.  then, for all http-links, the 'cache prefix' c_pref is prefixed
 * e.g. src = <A HREF="e/is/mc/2"> ref = http://www.edu/broeh
 * -> dest = <A HREF="http://www.edu/broeh/e/is/mc/2">, or
 * src = <A HREF="http:/gloink" ref = http://cache/http://remote.edu
 * -> dest = <A HREF="http://cache/http://remote.edu/gloink"
 * or, if translate_escaped is set,
 * -> dest = <A HREF="http://cache/http:%2f%2fremote.edu%2fgloink"
 *
 * Only the HREF of A tags, the SRC of IMG tags and the ACTION of FORM tags
 * are looked at; URLs with a scheme other than http: are copied unchanged.
 *
 * to_dest, to_src    in: start of the output buffer / of the null-terminated
 *                    input; out: the positions up to which output was
 *                    written / input was consumed
 * dest_len           size limit for the output; only checked once per
 *                    scanner step, see PROBLEM below
 * translate_escaped  passed on to do_translate_url
 *
 * returns 2 iff dest_len wasn't large enough, 1 iff src wasn't
 * fully copied, and 0 otherwise; src must be null-terminated
 * when 1 or 2 is returned, *to_src is left on the < of the last tag that
 * was entered (or at the start if there was none), *to_dest on the matching
 * output position, and the output is null-terminated there
 * PROBLEM: still screws up your memory when returning 2 !!!
 */
/* this is my own hack, and nowhere near safety or completeness		*/
/* this is very simplistic and I didn't check with the specs		*/
/* modeled after url.pl by Oscar Nierstrasz				*/

/* KNOWN PROBLEM: doesn't properly handle omission of either prefix	*/
/* doesn't collapse relative paths (. and ..) either			*/
{
  char a_pref[HUGE_STRING_LEN+1];  /* should be more than enough */

  char *src = *to_src; char *dest = *to_dest;

  char *s = src; char *d = dest; char *l = s;  /* l for last copied */
  char *safe_s = s; char *safe_d = d;  /* if breaking, copy up to these */
  int a_pref_len;
  int a_pref_path_offset;
  int state = OUTSIDE;
  register char c;
  int rc;

 /*
  * compute a_pref by chopping off of a_url everything after the last slash
  * note: this causes a prefix such as http://myserver to fail
  * use http://myserver/ instead
  */
  if (strlen(a_url) >= sizeof(a_pref))
  {
    /* the prefix might not fit in a_pref: do without one */
    a_pref[0] = '\0';
    log_if_debug("oops, absolute URL too long",a_url);
  }
  else if (!set_absolute_prefix(a_pref,a_url))
  /* the URL didn't contain a slash at all */
  {
    /* this case wasn't anticipated, we might return with an error code */
    a_pref[0] = '\0';
    log_if_debug("oops, absolute URL doesn't contain a slash",a_url);
  }
  a_pref_len = strlen(a_pref);

  /* compute path offset in a_pref, used for URLs starting with / */
  if (strncasecmp(a_pref,"http://",7) || a_pref_len < 8)
  {
    /* it's not an absolute http: URL prefix at all; ignore */
    /* this case wasn't anticipated, we might return with an error code */
    a_pref_path_offset = a_pref_len;
    log_if_debug("oops, no http: absolute prefix",a_pref);
  }
  else
  {
    /* put it on the last / in http://host[:port]/ */
    /* this is because rooted URLs bring their own / */
    char *s2 = &a_pref[6]; /* jump over initial string */
    while ((c = *++s2) && (c != '/')); /* jump onto next slash */
    a_pref_path_offset = s2-a_pref;  /* whether \0 or / there */
  }

  while (*s != '\0' && (d-dest <= dest_len))
  {
    if (state == OUTSIDE)
    {
      skip_whitespace(&s);
      while ((c = *s++) && c != '<')
        if (c == '\\' && *s)  /* escape character; never skip the final \0 */
          ++s;
      if (!c)
      {
        --s;
        goto done_processing;
      }
      /* copy everything up to and including the < */
      strncpy(d,l,s-l); d += s-l; l = s;
      /*
       * leaving OUTSIDE; we may need to back up to the <
       * safe_d must be taken AFTER d was advanced, so that it is the
       * output position of the very < that safe_s points at
       */
      safe_s = s-1; safe_d = d-1;
      state = IN_TAG;
    }
    else if (state == IN_TAG)
    /* may be a link, we don't know yet */
    {
      char *link_attr;
      skip_whitespace(&s);

      /* assume no strange characters in between, such as < > */
      if (!(link_attr = is_link_tag(&s,"A")) &&
          !(link_attr = is_link_tag(&s,"IMG")) &&
          !(link_attr = is_link_tag(&s,"FORM")))
      /* none of the required tags was found - it must be a different tag */
      /* or not a tag at all, e.g. EOF */
      {
        state = IN_WRONG_TAG;
      }
      else
      {
        skip_whitespace(&s);
        /* a link tag was found - scan for the link attribute */
        /* always assume no nested < and > occur */
        /* comparing a prefix only */
        while (*s && (*s != '>') && strncasecmp(s,link_attr,strlen(link_attr)))
          ++s;

        if (!*s)
        {
          goto done_processing;
        }

        if (*s != '>')
        {
          /* the attribute was found */
          s += strlen(link_attr);
          /* s points just beyond link attribute (ahem: <> LINK attribute!) */
          skip_whitespace(&s);
          /* next char should be an =; stop at the end of the tag or of */
          /* the input instead of running past the terminating null */
          while (*s && *s != '=' && *s != '>') ++s;
          if (!*s)
          {
            goto done_processing;
          }
          if (*s == '=')
          {
            ++s;
            skip_whitespace(&s);  /* fixed by caruso@bellcore.com */
            state = IN_URL;
          }
          else
          {
            state = IN_WRONG_TAG;
            /* attribute without a value, to be precise */
          }
        }
        else
        {
          state = IN_WRONG_TAG;
          /* missing attribute, to be precise */
        }
      }
    }
    else if (state == IN_WRONG_TAG)
    {

      /* tag of an uninteresting kind, broken tag, tag with missing attribute */
      /* skip to end of tag, assuming no intervening < and > in between */
      while (*s && *s != '>') ++s;
      if (!*s)
        goto done_processing;

      /* (*s == '>') */
      ++s;
      state = OUTSIDE;
      /* s now points just beyond the tag */
    }

    else if (state == IN_URL)
    {
      skip_whitespace(&s);  /* (courtesy of slshen@lbl.gov) */
      if (*s == '"')
      {
        /* HREF between quotes */
        ++s;
        /* s is on first char of URL */
        strncpy(d,l,s-l); d += s-l; l = s;
        /* simply assume NO quotes until closing bracket */
        while (*s && *s != '"') ++s;
        /* we're on the closing quote now */
        /* don't bother to search for a closing > */
      }
      else
      {
        /* HREF not between quotes */
        /* s is on first char of URL */
        strncpy(d,l,s-l); d += s-l; l = s;
        while (*s && *s != '>') ++s;
        /* we're on the <A> closing bracket now */
      }

      if (!*s) goto done_processing;

      /* convert the URL that starts at l and ends at s */

      if (strncasecmp(l,"http:",5))
      {
        /* it's not a http: type of URL */
        char *s2 = l;
        /* this stops at s at the latest, as *s is a " or a > */
        while ((c = *s2++) != '"' && c != '>' && c != ':');
        /* assumes under no conditions there's a " or > inside the URL */
        if (c == ':')
        /* another type of URL: leave it; on todo list */
        {
          /* treat as ordinary text */
          strncpy(d,l,s-l);
          d += s-l;
          l = s;
          state = OUTSIDE;
        }
        else /* c == '"' or c == '>' */
        /* relative URL without http:/ prefix */
        {
          /* beg & pray for this relative URL to be relative to a_pref */
          state = ON_RELATIVE_URL;
        }
      }
      else if (l[5] != '/' || l[6] != '/')
      /* a relative http URL */
      {
        /* strip off the http: prefix and treat like a relative URL */
        l += 5;
        state = ON_RELATIVE_URL;
      }
      else /* it is an absolute URL */
      {
        state = ON_ABSOLUTE_URL;
      }
    }

    else /* (state == ON_RELATIVE_URL || state == ON_ABSOLUTE_URL) */
    /* between l and s, we have a URL to be converted */
    {
      do_translate_url(d,l,s,a_pref,a_pref_len,a_pref_path_offset,a_url,
        state,translate_escaped);
      d += strlen(d);
      l = s;
      state = OUTSIDE;
    }
  }

  done_processing:
  /* now we're either broken at safe_s and safe_d or we aren't and */
  /* s is at terminating null char, which must always be there! */

  if (d-dest > dest_len)
  {
    rc = 2;  /* and leave the country if possible */
  }
  else if (state != OUTSIDE)
  {
    rc = 1;
  }
  else
  {
    /* copy the last piece */
    strncpy(d,l,s-l);
    d += s-l;
    safe_s = s; safe_d = d;  /* everything was copied safely */
    rc = 0;
  }

  *safe_d = '\0';

  *to_src = safe_s; *to_dest = safe_d;

  return rc;
}

int looks_like_html(char *buf, int nb)
/*
 * Heuristic: do the first nb bytes of buf look like they are part of an
 * HTML document?
 *
 * Scans for a < that is directly followed by one of a few well-known tag
 * openers (compared case-insensitively).  Scanning stops at nb bytes or at
 * the first null byte, whichever comes first; no byte at or beyond buf+nb
 * is ever read, so buf need not be null-terminated.
 *
 * returns 1 if such a tag is found, 0 otherwise
 * this probably both classifies non-HTML as HTML and vice versa
 */
{
  /* anchors are matched without a closing >, as attributes follow: */
  /* <A HREF="...">, <A NAME="..."> */
  static const char *const openers[] =
    { "TITLE>", "HEAD>", "H1>", "H2>", "A HREF", "A NAME", "P>" };
  const size_t n_openers = sizeof(openers) / sizeof(openers[0]);
  char *s = buf;
  char *past = buf + nb;

  while (s < past && *s)
  {
    if (*s++ == '<')
    {
      size_t avail = (size_t)(past - s);  /* bytes left after the < */
      size_t i;

      for (i = 0; i < n_openers; i++)
      {
        size_t len = strlen(openers[i]);
        if (len <= avail && !strncasecmp(s,openers[i],len))
        {
          /* we've got an opening HTML tag; simply assume we have HTML */
          return 1;
        }
      }
    }
  }
  return 0;
}
