/*
stats - a LifeLines database statistical extraction report program
	by Jim Eggert (eggertj@atc.ll.mit.edu)
	Version 1 (14 Dec 1992)
	Version 2 (17 Dec 1992) added restrictions, 
		unity, general GEDCOM tag, e<place>, and today
	Version 3 (20 Dec 1992) added sorting
	Version 4 (30 Jan 1993) bugfix, modified find_bin to use requeue()
		Requires LifeLines v2.3.3 or later
	Version 5 (30 Jun 1993) changed bubblesort to listsort
		listsort code by John Chandler (JCHBN@CUVMB.CC.COLUMBIA.EDU)
	Version 6 (2 July 1993) added firstname, changed user interface
		Requires LifeLines v2.3.4 or later

This LifeLines report program computes mean statistics of various
quantities binned over other quantities.  The quantities it knows
about are ages at and dates of birth, christening, first and last
marriage, first and last child's birth, death, burial, and today;
the number of children, siblings, and marriages; and sex, surname,
first name, soundex, and any simple GEDCOM tag.  These can be combined
nearly arbitrarily and evaluated over the whole database, or
restricted to either ancestors or descendants of a chosen individual.
Further restrictions on the individuals included in the statistics can
be based on any quantity that the program knows about.  The program
will optionally print out the names of all the individuals included in
the statistics.

For example, you can produce statistics of
    the age at death of as a function of birth year,
	dage vs byear
    the number of children of females named Smith as a
    function of year of first marriage,
	kids vs myear | sex = F & surname = Smith
    the number of spouses for male vs female blacksmiths,
	families vs sex | occu = blacksmith
    the age at last childbirth as a function of place of marriage.
	qage vs mplace
    the names of all Joneses who lived to be greater than 80
	unity vs unity | surname = Jones & dage > 80
All this without writing any programs of your own.

If a particular statistic for an individual is unavailable, and if the
global variable not_strict is nonzero (as it is in the distribution
version of this report, then certain guesses are allowed as to the
value of that statistic.  So far, these guesses are few.  Birth year
and month are guessed from baptismal date, and death year and month
are guessed from burial date.

The user is prompted for what quantity to plot vs what to bin over.
Each is to be given as a specification string of the form
	e<time> or e<place> or <count> or <label>
   where e is the event code, <time> and <place> are place codes,
   <count> is the count codes, and <label> is the label code.
event codes
	b birth (can get year or month from christening)
	c christening (can get year or month from birth)
	d death (can get year or month from burial)
	e burial (entombment) (can get year or month from death)
	m marriage (first)
	n marriage (last)
	p parturition or childbirth (first)
	q parturition (last)
	t today (most useful with age)
time codes
	age (in years)
	year
	month
	day
place codes
	place (tplace is always the empty string)
count codes
	unity (always returns one, good for simple histograms or averages)
	kids
	families
	siblings
	siblingorder
label codes
	sex
	surname
	firstname
	soundex
	<tag> (returns the value of the first GEDCOM node with this tag)
	      (a useful example is OCCU for occupation)

For example, to compute statistics of age at death vs year of first
marriage, enter dage then myear then the bin size for myear.
To compute statistics of the number of children versus occupation,
enter kids then occu.

The statistics can also be restricted according to any of the known
characteristics.  The user is prompted for any constraints, to be entered
one at a time.  For example, if you want to accumulate statistics only
if the person is female, enter the constraint
	sex = F
or if you want to allow only people born after 1800
	byear > 1800

Multiple constraints are allowed, just enter return if you don't want
any more.  Please note that the constraints are composed of three
parts: left hand side, inequality symbol, and right hand side.  These
three parts must be separated by one or more spaces.

Case is not important.  You can't compute averages of label codes,
as that would not be meaningful.
*/

global(ret_val)   /* return value */
global(ret_type)  /* return type 0=none, 1=string, 2=integer */
global(bin)       /* returned by find_bin */
global(minbin)    /* maintained by find_bin */
global(maxbin)    /* ditto */
global(deltabin)  /* zero for binning over non-numeric types */
global(not_strict)/* 0=strict dates, 1=estimate dates */
global(accuracy)  /* 10^digits after the decimal */
global(log)       /* 0=don't, 1=do write an individual log */
global(rlist)     /* list of restriction variable names */
global(tlist)     /* list of restriction variable types */
global(clist)     /* list of restriction inequality codes */
global(vlist)     /* list of restriction values */
global(comparison)/* for sorts */
global(token)     /* for tokens */
global(untoken)   /* rest of string after token is taken */

/* compute a value type from the value specification string */
proc value_type(spec_string) {
    set(ret_val,0)
    set(initial,save(trim(spec_string,1)))
    if (not(and(
	    and(
	    and(
	    and(strcmp(initial,"B"),
		strcmp(initial,"C")),
	    and(strcmp(initial,"M"),
		strcmp(initial,"N"))),
	    and(
	    and(strcmp(initial,"P"),
		strcmp(initial,"Q")),
	    and(strcmp(initial,"D"),
		strcmp(initial,"E")))),
		strcmp(initial,"T")))) {
	if (not(and(
		and(strcmp(concat(initial,"AGE"),spec_string),
		    strcmp(concat(initial,"YEAR"),spec_string)),
		and(strcmp(concat(initial,"MONTH"),spec_string),
		    strcmp(concat(initial,"DAY"),spec_string))))) {
	    set(ret_val,2)
	}
	elsif (not(strcmp(concat(initial,"PLACE"),spec_string))) {
	    set(ret_val,1)
	}
    }
    if (not(ret_val)) {
	if  (not(and(
		    and(
		    and(strcmp(spec_string,"UNITY"),
			strcmp(spec_string,"KIDS")),
		    and(strcmp(spec_string,"FAMILIES"),
			strcmp(spec_string,"SIBLINGS"))),
			strcmp(spec_string,"SIBLINGORDER")))) {
	    set(ret_val,2)
	}
	else { set(ret_val,1) } /* SEX, SURNAME, FIRSTNAME,
				   SOUNDEX, or GEDCOM tag */
    }
}

/* get the specified value for a person */
proc get_val(person,spec_string) {
    list(namelist)
    set(ret_val,0)
    set(ret_type,0)
    set(event,0)
    set(initial,save(trim(spec_string,1)))
    if (not(strcmp(initial,"B"))) {
	set(e,birth(person))
	set(e1,baptism(person))
	set(event,1)
    }
    elsif (not(strcmp(initial,"C"))) {
	set(e,baptism(person))
	set(e1,birth(person))
	set(event,1)
    }
    elsif (not(strcmp(initial,"D"))) {
	set(e,death(person))
	set(e1,burial(person))
	set(event,1)
    }
    elsif (not(strcmp(initial,"E"))) {
	set(e,burial(person))
	set(e1,death(person))
	set(event,1)
    }
    elsif (not(strcmp(initial,"M"))) {
	families(person,fam,spouse,fnum) {
	    if (eq(event,0)) {
		set(e,marriage(fam))
		set(event,1)
	    }
	}
    }
    elsif (not(strcmp(initial,"N"))) {
	families(person,fam,spouse,fnum) {
	    set(e,marriage(fam))
	    set(event,1)
	}
    }
    elsif (not(strcmp(initial,"P"))) {
	families(person,fam,spouse,fnum) {
	    if (eq(event,0)) {
		children(fam,child,cnum) {
		    if (eq(event,0)) {
			set(e,birth(child))
			set(e1,baptism(child))
			set(event,1)
		    }
		}
	    }
	}
    }
    elsif (not(strcmp(initial,"Q"))) {
	families(person,fam,spouse,fnum) {
	    children(fam,child,cnum) {
		set(e,birth(child))
		set(e1,baptism(child))
		set(event,1)
	    }
	}
    }
    elsif (not(strcmp(initial,"T"))) {
	set(e,gettoday())
	set(event,1)
    }
    if (eq(event,1)) {
	if (e) { extractdate(e,day,month,year) }
	if (e1) { extractdate(e1,day1,month1,year1) }
	if (not(strcmp(spec_string,concat(initial,"YEAR")))) {
	    set(event,2)
	    if (year) { set(ret_val,year) set(ret_type,2) }
	    elsif (and(not_strict,year1)) {
		set(ret_val,year1) set(ret_type,2)
	    }
	}
	elsif (not(strcmp(spec_string,concat(initial,"MONTH")))) {
	    set(event,2)
	    if (month) { set(ret_val,month) set(ret_type,2) }
	    elsif (and(not_strict,month1)) {
		set(ret_val,month1) set(ret_type,2)
	    }
	}
	elsif (not(strcmp(spec_string,concat(initial,"DAY")))) {
	    set(event,2)
	    if (month) { set(ret_val,day) set(ret_type,2) }
	    elsif (and(not_strict,day1)) {
		set(ret_val,day1) set(ret_type,2)
	    }
	}
	elsif (not(strcmp(spec_string,concat(initial,"AGE")))) {
	    set(event,2)
	    set(byear,0)
	    set(b,birth(person))
	    if (b) { extractdate(b,bday,bmonth,byear) }
	    if (and(not(byear),not_strict)) {
		set(b,baptism(person))
		if (b) { extractdate(b,bday,bmonth,byear) }
	    }
	    if (byear) {
		if (year) {
		    set(ret_val,sub(year,byear)) set(ret_type,2)
		}
		elsif (year1) {
		    set(ret_val,sub(year1,byear)) set(ret_type,2)
		}
	    }
	}
	elsif (not(strcmp(spec_string,concat(initial,"PLACE")))) {
	    set(event,2)
	    set(ret_val,save(place(e)))
	    if (and(not_strict,not(strcmp(ret_val,"")))) {
		set(ret_val,save(place(e1)))
	    }
	    set(ret_type,1)
	}
    }
    if (ne(event,2)) {
	if (not(strcmp(spec_string,"KIDS"))) {
	    families(person,fam,spouse,fnum) {
		set(ret_val,add(nkids,nchildren(fam)))
	    }
	    set(ret_type,2)
	}
	elsif (not(strcmp(spec_string,"FAMILIES"))) {
	    set(ret_val,nfamilies(person))
	    set(ret_type,2)
	}
	elsif (not(strcmp(spec_string,"SIBLINGS"))) {
	    if (fam,parents(person)) {
		set(ret_val,nchildren(fam))
		set(ret_type,2)
	    }
	}
	elsif (not(strcmp(spec_string,"SIBLINGORDER"))) {
	    if (fam,parents(person)) {
		set(ret_type,2)
		set(personkey,save(key(person)))
		children(fam,child,cnum) {
		    if (not(strcmp(key(child),personkey))) {
			set(ret_val,cnum)
		    }
		}
	    }
	}
	elsif (not(strcmp(spec_string,"UNITY"))) {
	    set(ret_val,1)
	    set(ret_type,2)
	}
/* The next four lines will work even if you comment them out.
   Sex is a powerful force, I guess. */
	elsif (not(strcmp(spec_string,"SEX"))) {
	    set(ret_val,save(sex(person)))
	    set(ret_type,1)
	}
	elsif (not(strcmp(spec_string,"SURNAME"))) {
	    set(ret_val,save(surname(person)))
	    set(ret_type,1)
	}
	elsif (not(strcmp(spec_string,"FIRSTNAME"))) {
	    extractnames(inode(person), namelist, ncomp, sindx)
	    if( or( gt(sindx,1), gt(ncomp,sindx))) {
		set(gindx,1) if(eq(sindx,1)) { set(gindx,2) }
		set(ret_val, save(getel(namelist, gindx)))
	    }
	    else { set(ret_val,"") }
	    set(ret_type,1)
	}
	elsif (not(strcmp(spec_string,"SOUNDEX"))) {
	    set(ret_val,save(soundex(person)))
	    set(ret_type,1)
	}
	else {
	    fornodes(inode(person),node) {
                if (and(not(ret_type),not(strcmp(tag(node),spec_string)))) {
		    set(ret_val,save(value(node)))
		    set(ret_type,1)
		}
	    }
	}
    }
}

proc find_bin(xvalue,ycounts,ysums,xvals) {
    if (gt(deltabin,0)) { /* numeric data type */
	if (lt(maxbin,minbin)) { /* first time through */
	    set(minbin,sub(xvalue,mod(xvalue,deltabin)))
	    set(maxbin,add(minbin,deltabin))
	    enqueue(ycounts,0)
	    enqueue(ysums,0)
	    set(bin,1)
	}
	else {
	    while (lt(xvalue,minbin)) {
		requeue(ycounts,0)
		requeue(ysums,0)
		set(minbin,sub(minbin,deltabin))
	    }
	    while (ge(xvalue,maxbin)) {
		set(maxbin,add(maxbin,deltabin)) 
		/* Don't need to extend array if xvalue >= maxbin,
		   because setel automagically does this for us. */
	    }
	    set(bin,add(div(sub(xvalue,minbin),deltabin),1))
	}
    }
    else { /* unsorted string data type */
	set(bin,0)
	forlist(xvals,xlabel,xnum) {
	    if (not(strcmp(xvalue,xlabel))) { set(bin,xnum) }
	}
	if (not(bin)) {
	    enqueue(xvals,xvalue)
	    enqueue(ycounts,0)
	    enqueue(ysums,0)
	    set(bin,add(xnum,1))
	}
    }
}

proc filter_person(person) {
    set(pass,1)
    forlist(rlist,restriction,rnum) {
	if (pass) {
	    set(rtype,getel(tlist,rnum))
	    set(inequality,getel(clist,rnum))
	    set(cvalue,getel(vlist,rnum))
	    call get_val(person,restriction)
	    set(pass,0)
	    if (eq(ret_type,rtype)) {
		if (eq(rtype,2)) { /* numeric */
		    if (not(strcmp(inequality,"="))) {
			if (eq(ret_val,cvalue)) { set(pass,1) }
		    }
		    elsif (not(strcmp(inequality,"!="))) {
			if (ne(ret_val,cvalue)) { set(pass,1) }
		    }
		    elsif (not(strcmp(inequality,">"))) {
			if (gt(ret_val,cvalue)) { set(pass,1) }
		    }
		    elsif (not(strcmp(inequality,"<"))) {
			if (lt(ret_val,cvalue)) { set(pass,1) }
		    }
		    elsif (not(strcmp(inequality,">="))) {
			if (ge(ret_val,cvalue)) { set(pass,1) }
		    }
		    elsif (not(strcmp(inequality,"<="))) {
			if (le(ret_val,cvalue)) { set(pass,1) }
		    }
		}
		else { /* string */
		    if (not(strcmp(inequality,"="))) {
			if (eq(strcmp(ret_val,cvalue),0)) { set(pass,1) }
		    }
		    elsif (not(strcmp(inequality,"!="))) {
			if (ne(strcmp(ret_val,cvalue),0)) { set(pass,1) }
		    }
		    elsif (not(strcmp(inequality,">"))) {
			if (gt(strcmp(ret_val,cvalue),0)) { set(pass,1) }
		    }
		    elsif (not(strcmp(inequality,"<"))) {
			if (lt(strcmp(ret_val,cvalue),0)) { set(pass,1) }
		    }
		    elsif (not(strcmp(inequality,">="))) {
			if (ge(strcmp(ret_val,cvalue),0)) { set(pass,1) }
		    }
		    elsif (not(strcmp(inequality,"<="))) {
			if (le(strcmp(ret_val,cvalue),0)) { set(pass,1) }
		    }
		}
	    }
	}
    }
    set(ret_val,pass)
}

proc stat_person(person,xstring,ystring,ycounts,ysums,xvals) {
    call filter_person(person)
    if (ret_val) {
	call get_val(person,ystring)
	if (eq(ret_type,2)) {
	   set(value,ret_val)
	    call get_val(person,xstring)
	    if (ret_type) {
		if (log) {
		    key(person) col(8) name(person) col(50)
		    if (eq(ret_type,1)) { ret_val } else { d(ret_val) }
		    col(65) d(value) "\n"
		}
		call find_bin(ret_val,ycounts,ysums,xvals)
		setel(ycounts,bin,add(getel(ycounts,bin),1))
		setel(ysums,bin,add(getel(ysums,bin),value))
	    }
	}
    }
}

proc print_frac(frac) {
    set(check,div(accuracy,10))
    while (gt(check,1)) {
	if (lt(frac,check)) { "0" }
	set(check,div(check,10))
    }
    d(frac)
}

proc compare(astring,bstring) {
    set(comparison,strcmp(astring,bstring))
}

proc listsort(alist,ilist) {
/*
   Input:  alist  - list of strings (could use numbers, instead)
   Output: ilist  - list of index pointers into "alist" in sorted order
   Needed: compare- external "function" of two arguments to set global
                    variable "comparison" to -1,0,+1 according to relative
                    order of the two arguments

   Uses an internal list as a work area.  This is significantly faster
   than a bubble sort.

*/
        list(wlist)

        set(list_size,length(alist))
        set(completion,0)
        set(index,0)
        while (lt(index,list_size)) {
                set(index,add(index,1))
                setel(ilist,index,index)
        }
        while (lt(completion,list_size)) {
                set(count,0)
                while (lt(count,list_size)) {
                        set(count,add(count,1))
                        set(index,dequeue(ilist))
                        set(test,getel(alist,index))
                        if(eq(count,1)) {
                                set(work_min,test)
                                set(work_max,test)
                        }
                        call compare(test,work_max)
                        if(ge(comparison,0)) {
                                enqueue(wlist,index)
                                set(work_max,test)
                        } else { call compare(test,work_min)
                                while(gt(comparison,0)) {
                                        enqueue(ilist,dequeue(wlist))
                                        set(work_min,getel(alist,getel(wlist,1)))
                                        call compare(test,work_min)
                                }
                                requeue(wlist,index)
                                set(work_min,test)
                        }
                }
                set(completion,length(wlist))
                while (not(empty(wlist))) { enqueue(ilist,dequeue(wlist)) }
        }
}

proc get_token(input) {
/*  Parse a token from the input string.
    Tokens are separated by one or more spaces.
    Set global parameter token to the first token string.
    Set global parameter untoken to the rest of the string after first token.
*/
/* strip leading spaces */
    set(untoken,save(input))
    set(first_space,index(untoken," ",1))
    while (eq(first_space,1)) {
	set(untoken,save(substring(untoken,2,strlen(untoken))))
	set(first_space,index(untoken," ",1))
    }
/* get token and untoken */
    if (not(first_space)) {
	set(token,save(untoken))
	set(untoken,save(""))
    }
    else {
	set(token,save(substring(untoken,1,sub(first_space,1))))
	set(untoken,save(
	    substring(untoken,add(first_space,1),strlen(untoken))))
    }
}

proc testmain() {
    set(untoken,"This is a test.")
    while(strlen(untoken)) {
	call get_token(untoken)
	token "\n"
    }
}

proc main() {

    set(not_strict,1)
    set(accuracy,100)  /* compute and print to 0.01 */
    list(ysums)   /* bin sums */
    list(ycounts) /* bin counts */
    list(xvals)   /* bin values */
    list(relist)  /* contains restriction equations */
    list(rlist)   /* contains restriction LHSs */
    list(tlist)   /* contains restriction LHS types */
    list(clist)   /* contains restriction inequalities */
    list(vlist)   /* contains restriction RHSs */
    list(ilist)   /* index list for sorting */
    indiset(people)

    set(ret_val,1)
    while (eq(ret_val,1)) {
	getstrmsg(ystring,"Collect statistics of ")
	set(ystring,save(upper(ystring)))
	call value_type(ystring)
	if (eq(ret_val,1)) {
	    print("Can't do statistics on ") print(ystring) print("\n")
	}
    }
    getstrmsg(xstring,"versus ")
    set(xstring,save(upper(xstring)))
    call value_type(xstring)
    if (eq(ret_val,1)) { set(deltabin,0) }
    else {
	while (not(deltabin)) {
	    getintmsg(deltabin,"with bin width ")
	    if (eq(deltabin,0)) {
		print("Can't have zero bin width\n")
	    }
	}
	set(minbin,1000000)
	set(maxbin,neg(minbin))
    }
    getintmsg(log,"logging people (0=no, 1=yes) ")
    getintmsg(who,"over set (0=all, 1=descendants, 2=ancestors) ")
    if (who) { getindimsg(of,"of ") }
    set(restriction,1)
    while (restriction) {
	getstrmsg(restriction_equation,"restricted by ")
	if (strcmp(restriction_equation,"")) {
	    set(restriction_equation,save(restriction_equation))
	    call get_token(restriction_equation)
	    set(restriction,save(upper(token)))
	    call value_type(restriction)
	    call get_token(untoken)
	    set(inequality,save(token))
	    if (inequality) {
		if (eq(ret_val,1)) {
		    set(value,save(untoken))
		}
		else {
		    set(value,atoi(untoken))
		}
		enqueue(rlist,restriction)
		enqueue(tlist,ret_val)
		enqueue(clist,inequality)
		enqueue(vlist,value)
		enqueue(relist,restriction_equation)
	    }
	}
	else { set(restriction,0) }
    }
    if (log) {
	"-------------------------------------------------------------------\n"
	"Log of individuals and their values used in the statistics\n"
	"-------------------------------------------------------------------\n"
	"key" col(8) "name" col(50) xstring col(65) ystring "\n"
	"-------------------------------------------------------------------\n"
    }

    if (eq(who,0)) {
	forindi(person,pnum) {
	    call stat_person(person,xstring,ystring,ycounts,ysums,xvals)
	}
    }
    else {
	addtoset(people,of,0)
	if (eq(who,1)) { set(people,descendentset(people)) }
	elsif (eq(who,2)) { set(people,ancestorset(people)) }
	addtoset(people,of,0)
	forindiset(people,person,pval,pnum) {
	    call stat_person(person,xstring,ystring,ycounts,ysums,xvals)
	}
    }

    set(sort_it,0)
    if (eq(deltabin,0)) {
	getintmsg(sort_it,"Sort the output? (0=no, 1=yes)")
    }
    if (sort_it) { call listsort(xvals,ilist) }
    else { forlist(ycounts,ycount,ynum) { enqueue(ilist,ynum) } }

    if (log) {
	"-------------------------------------------------------------------\n"
    }
    "-------------------------------------------------------------------\n"
    "Statistics of " ystring " binned by " xstring
    if (not(empty(relist))) {
	" subject to"
	while(not(empty(relist))) {
	    "\n"
	    dequeue(relist)
	}
    }
    "\n"
    "-------------------------------------------------------------------\n"
    col(8) xstring if (gt(deltabin,0)) { " range" }
    col(40) "bin" col(50) "total" col(60) "average\n"
    "bin" col(8)
    if (gt(deltabin,0)) { "from" col(24) "to" set(binx,minbin) }
    else { "label" }
    col(40) "count" col(50) ystring col(60) ystring "\n"
    "-------------------------------------------------------------------\n"
    set(bin,0)
    set(allycount,0)
    set(allysum,0)
    forlist(ilist,index,num) {
	set(ycount,getel(ycounts,index))
	set(ysum,getel(ysums,index))
	set(bin,add(bin,1))
	d(bin) col(8)
	if (gt(deltabin,0)) { d(binx)
	    set(binx,add(binx,deltabin))
	    col(24) d(binx)
	}
	else { getel(xvals,index) }
	col(40) d(ycount) set(allycount,add(allycount,ycount))
	col(50) d(ysum) set(allysum,add(allysum,ysum))
	col(60)
	if (gt(ycount,0)) {
	    d(div(ysum,ycount)) "."
	    set(frac,div(mul(mod(ysum,ycount),accuracy),ycount))
	    call print_frac(frac)
	}
	else { "0.000" }
	"\n"
    }
    "-------------------------------------------------------------------\n"
    "all" col(8)
    if (gt(deltabin,0)) { d(minbin) col(24) d(maxbin) }
    else { xstring }
    col(40) d(allycount)
    col(50) d(allysum)
    col(60)
    if (gt(allycount,0)) {
	d(div(allysum,allycount)) "."
	set(frac,div(mul(mod(allysum,allycount),accuracy),allycount))
	call print_frac(frac)
    }
    else { "0." call print_frac(0) }
    "\n"
    "-------------------------------------------------------------------\n"
}
