# MK_IDENTLIST
#prepare C source files for making an inverted index, or whatever, using
#identlist and identlist1 to: delete quoted strings, comments; 
#convert punctuation to white space; delete numbers and all upper case words;
#the result, with some stuff flagged for later reversal,
#should be suitable for making an inverted index of identifiers in each 
#source file.
#
#when "invert" from the "bib" suite of programs is used, the C keywords and
#the system library can be excluded as common words
#
#(NOTE: as a test, the output of the loop containing the filters is 
#sent through tr, sort and uniq, to produce a word list)

#CDIR is the directory the source files are read from.
#It is taken from the directory part of the first argument; when that is "."
#(no directory given) DEFAULTDIR is used instead.
DEFAULTDIR="."
#$1 is quoted so a path containing blanks is not split, and defaults to "."
#so that dirname always receives an operand even when no file is given.
CDIR=$(dirname -- "${1:-.}")
if [ "${CDIR}" = "." ]
then
	CDIR=${DEFAULTDIR}
fi

#Run every file named on the command line through identlist and identlist1,
#drop lines that contain only blanks, then break the combined output into one
#word per line and print the sorted list without duplicates on stdout.
#Only the basename of each argument is used; the file is read from ${CDIR}.
#"$@" and the quoted expansions keep file names with blanks or glob
#characters intact.
for i in "$@"
do
	FILE=$(basename -- "${i}")
	printf 'processing file: %s\n' "${i}" >&2	#progress goes to stderr
	identlist <"${CDIR}/${FILE}" | identlist1 |
	sed -e '/^[[:blank:]]*$/d'
done | tr -s ' \t' '\012\012' | sort | uniq		#produce a word list


#temporary++++++++++++++++++++++++++++++++++++++++++++++++++
#The script stops here, so only the word-list loop above is run.
#Nothing below this line is executed until this exit is removed.
exit

#to make an inverted index, replace the above loop by the following:

#flag_identifiers: filter from stdin to stdout that flags constructs for
#later reversal:
#  x.member<blank>   becomes  xqsq<blank>
#  x->member<blank>  becomes  xqsq<blank>
#  a_b               becomes  aquqb
#and deletes lines that contain only blanks.
#Notes:
#  - "->" is written unescaped; GNU sed treats "\>" as an end-of-word anchor,
#    so an escaped form never matches the arrow.
#  - [[:blank:]] is used for "space or tab"; inside a POSIX bracket
#    expression "\t" is a backslash and the letter t, not a tab.
#  - the underscore rule is given twice because matches of one s///g pass
#    cannot overlap: in a_b_c the first pass only rewrites a_b.
flag_identifiers() {
	sed -e 's/\([a-zA-Z_0-9]\)\.[^[:blank:]]*[[:blank:]]/\1qsq /g' \
		-e 's/\([a-zA-Z_0-9]\)->[^[:blank:]]*[[:blank:]]/\1qsq /g' \
		-e 's/\([a-zA-Z0-9]\)_\([a-zA-Z0-9]\)/\1quq\2/g' \
		-e 's/\([a-zA-Z0-9]\)_\([a-zA-Z0-9]\)/\1quq\2/g' \
		-e '/^[[:blank:]]*$/d'
}

#Filter each file named on the command line and write the result to a file
#of the same basename in the current directory; the names are collected in
#FILELIST (blank separated).
#WARNING: when ${CDIR} is the current directory the output file is the input
#file itself and the source is destroyed.
for i in "$@"
do
	FILE=$(basename -- "${i}")
	printf 'processing file: %s\n' "${i}" >&2	#progress goes to stderr
	identlist <"${CDIR}/${FILE}" | identlist1 | flag_identifiers >"${FILE}"
	FILELIST="${FILELIST} ${FILE}"
done


#NOTE: if the above changes are made, mk_identlist should be run in 
#an empty (scratch) directory;
#intermediate files of the same name as the C source files are created;
#the directory with the C source files can be given as DEFAULTDIR or as
#the absolute pathname of the first file in the list;

#NOTE: the following is an example of processing to get an inverted index.

#unflag_index: filter from stdin to stdout that reverses the flags in the
#index text: quq becomes "_", qsq becomes STRUCT. It also removes
#"digits/digits" items that follow a non-digit, both inside a line (when a
#non-digit follows as well) and at the end of a line.
unflag_index() {
	sed -e 's/quq/_/g' -e 's/qsq/STRUCT/g' \
		-e 's/\([^0-9]\)[0-9][0-9]*\/[0-9][0-9]*\([^0-9]\)/\1\2/g' \
		-e 's/\([^0-9]\)[0-9][0-9]*\/[0-9][0-9]*$/\1/' 
}

#Build the index, keep the raw version as INDEX.long and write the cleaned,
#sorted version to INDEX.
#${FILELIST} is deliberately unquoted: it is a blank-separated list of file
#names and must be split into separate arguments.
#INDEX is only rewritten when both invert and mv succeed, so a failed run
#does not replace it with the cleaned copy of a stale INDEX.long.
if invert -ccommon -k5000 -l30 ${FILELIST} && mv INDEX INDEX.long
then
	unflag_index <INDEX.long | sort -o INDEX
else
	echo "mk_identlist: invert or mv failed, INDEX not rebuilt" >&2
	false		#leave a failing status for the caller
fi

