: '@(#) ngsizes 1.7 93/08/18 00:09:10'
#
# ngsizes - Generate disk usage summary for USENET newsgroups.
#
# Copyright 1990-1993, Unicom Systems Development.  All rights reserved.
# See accompanying README file for terms of distribution and use.
#
# Usage:
#
#   ngsizes [-D] [-b breakdown_list] [-t threshold]
#
#	-t  Specifies only groups using "threshold" or more disk blocks should
#	    be reported.  The default is defined by the "threshold" parameter
#	    below.
#
#	-b  Specifies how usage should be broken down versus age.  For example,
#	    saying "-b 0,7,14" will report usage in three columns:  the total
#	    usage, the usage by articles a week or older, and the usage by
#	    articles two weeks or older.  The default is defined by the
#	    "breakdown" parameter below.
#
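#	-D  For debugging:  the work files are kept under the fixed name
#	    /tmp/ngsz.* instead of being removed, and an existing disk usage
#	    scan found there is reused rather than regenerated.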
#

# Message printed on any command line error.  It lists every option the
# parser accepts, including the debugging switch -D.
USAGE="usage: $0 [-D] [-b breakdown_list] [-t threshold]"


##############################################################################
#
# Site-specific definitions.
#

SPOOLDIR='/usenet/spool/news'		# Top of the Usenet article spool tree.
ACTIVE='/usenet/lib/news/active'	# File listing the active newsgroups.
DU='du'					# How to invoke the enhanced "du".

#
# Cross-posted articles:  leave exactly one DU_LINKOPTS setting enabled
# to say how the disk space they occupy is accounted for.
#
#DU_LINKOPTS=-L		# Average the usage across the groups it appears in.
#DU_LINKOPTS=		# Count it only the first time it is encountered.
DU_LINKOPTS='-l'	# Count it every time it appears.

#
# Reporting units:  leave exactly one DU_BSIZE setting enabled.
#
#DU_BSIZE=		# Report usage however du normally does.
DU_BSIZE='-k'		# Report usage in KB.

#
# Default reporting threshold.  A newsgroup whose usage is below this value
# is left out of the report.  The number is in whatever units DU_BSIZE
# selects.  The "-t" command line option overrides this default.
#
DFLT_THRESHOLD='0'

#
# Default age breakdown.  The report has one usage column per number in
# this comma separated list; each number is an age in days, and its column
# shows the disk space used by articles that many days old or older.  The
# "-b" command line option overrides this default.
#
DFLT_BREAKDOWN='0,1,3,5,7,15'

#
# Work files, all named from the prefix $TMP:
#	$TMP.read	Readership statistics.
#	$TMP.ngs	List of all newsgroups to check.
#	$TMP.du		Disk usage for all directories in the news spool dir.
# On signal 1, 2 or 3 (hangup, interrupt, quit) the work files are removed
# and the script exits with status 1.
#
TMP="/tmp/ngsz$$"
trap 'rm -f $TMP.* ; exit 1' 1 2 3

#
# End of site-specific customizations.
#
##############################################################################


#
# Initialize.
#
debug=0
threshold=$DFLT_THRESHOLD
breakdown=$DFLT_BREAKDOWN

#
# parse_options - Crack the command line options.
#
# Arguments:	the command line words to examine.
# Sets:		debug, threshold and breakdown.  With -D it also resets TMP
#		to the fixed prefix /tmp/ngsz and ignores signals 1, 2 and 3,
#		so that the work files are left in place.
# Exits:	prints the usage message on stderr and exits with status 1
#		on an unknown option, a missing option argument, or any
#		operand left over after the options.
#
# The "getopts" builtin is used instead of "set -- `getopt ...`" for two
# reasons:  the exit status seen after "set" is that of "set" itself, so a
# getopt failure would go unnoticed, and getopts keeps an option argument
# containing white space in one piece.
#
parse_options() {
	OPTIND=1		# restart the scan at the first word
	while getopts 'Db:t:' ngs_opt ; do
		case "$ngs_opt" in
		D)  TMP=/tmp/ngsz debug=1 ; trap '' 1 2 3 ;;
		b)  breakdown="$OPTARG" ;;
		t)  threshold="$OPTARG" ;;
		*)  echo "$USAGE" 1>&2 ; exit 1 ;;
		esac
	done
	# OPTIND now indexes the first operand, and this command takes none.
	if [ "$OPTIND" -le $# ] ; then
		echo "$USAGE" 1>&2
		exit 1
	fi
}
parse_options "$@"

#
# Verify we can find the active file.
#
# Without a readable active file there is nothing to report on:  complain,
# remove any work files (unless debugging) and give up with status 1.
if test ! -r $ACTIVE ; then
	echo "$0: file '$ACTIVE' not found or unreadable" >&2
	if [ $debug -eq 0 ] ; then
		rm -f $TMP.*
	fi
	exit 1
fi

#
# Get a count of the readers for each newsgroup.
# Output format will be "readership_count newsgroup_name"
#
# Field 6 of each /etc/passwd entry gives one candidate ".newsrc" path.
# From every such file that exists, take the lines containing a ":" and
# keep the text before it (the group name); "uniq -c" then tallies how
# many times each name was seen across all the files.
for rcfile in `awk -F: '{ printf "%s/.newsrc\n", $6 }' /etc/passwd | sort -u`
do
	if test -f $rcfile ; then
		sed -n 's/:.*//p' $rcfile
	fi
done | sort | uniq -c > $TMP.read

#
# Build a sorted list of all known newsgroups from the active file.
#
# Cut each active-file line at its first blank or tab, leaving only the
# newsgroup name, and drop lines that end up empty.
sed -e 's/[ 	].*//' -e '/^$/d' $ACTIVE \
| sort -u > $TMP.ngs

#
# Scan the spool directory for disk usage.  Convert the newsgroup pathname
# to a newsgroup name, and move it to the first field on the line.
# Output format will be "newsgroup_name usage usage ..."
#
# When debugging, a scan left behind by an earlier run is reused as is;
# otherwise the spool directory is scanned afresh.  The sed script moves
# the last tab-separated field (the pathname) to the front of the line,
# strips the spool directory prefix and turns every "/" into ".".
[ $debug -ne 0 -a -f $TMP.du ] || {
	$DU -ir $DU_LINKOPTS $DU_BSIZE -c "$breakdown" $SPOOLDIR |
	sed -e 's/^\(.*\)	\([^	]*\)$/\2	\1/' \
	    -e "s!$SPOOLDIR/!!" \
	    -e 's!/!.!g' |
	sort -u > $TMP.du
}

#
# Generate the report.
#
#
# generate_report - Print the usage report on standard output.
#
# Reads the work files $TMP.read, $TMP.du and $TMP.ngs and the shell
# variables "breakdown" and "threshold", hands them to awk as tagged
# records, and prints a heading line followed by one line per reported
# newsgroup, ordered by decreasing value of the first usage column.
#
generate_report() {
	(
		echo "BREAKDOWN $breakdown" | sed 's/,/ /g'
		echo "THRESHOLD $threshold"
		sed 's/^/READERS /' $TMP.read
		# The join keeps only groups named in both work files.  The
		# sort key is written "-k 2" because the obsolete spelling
		# "+1" is taken to be a file name by current versions of sort.
		join $TMP.du $TMP.ngs | sort -rn -k 2 | sed 's/^/NEWSGROUP /'
	) | awk '

BEGIN {
	LINE_WIDTH = 79		# maximum length of a line
	NG_WIDTH = 26		# width of field to print newsgroup in
	READR_WIDTH = 4		# width of field to print number of readers in
	MAX_FIELD_WIDTH = 8	# widest a usage column is allowed to be
	FRONT_FMT = "%-" NG_WIDTH "." NG_WIDTH "s" "%" READR_WIDTH "s"
}

#
# Record "BREAKDOWN n1 n2 ..."
#   Defines the format for the newsgroup usage lines.  Each "n" corresponds
#   to one column in the newsgroup usage line, and specifies the age of
#   articles which consume this amount of disk space.  The heading line is
#   printed as soon as this record is seen.
#
$1 == "BREAKDOWN" {
	num_breakdn = NF - 1
	# Share what is left of the line among the usage columns.  The width
	# is truncated to a whole number so that it forms a plain printf
	# width, and an empty breakdown list must not cause a division by
	# zero.
	FIELD_WIDTH = MAX_FIELD_WIDTH
	if ( num_breakdn > 0 ) {
		FIELD_WIDTH = int( (LINE_WIDTH - (NG_WIDTH+READR_WIDTH)) / num_breakdn )
		if ( FIELD_WIDTH > MAX_FIELD_WIDTH )
			FIELD_WIDTH = MAX_FIELD_WIDTH
	}
	FIELD_FMT = "%" FIELD_WIDTH "s"
	printf(FRONT_FMT,"newsgroup","read")
	for ( i = 0 ; i < num_breakdn ; ++i )
		printf(FIELD_FMT,sprintf("%ddays",$(i+2)))
	printf("\n")
	next
}

#
# Record "THRESHOLD n"
#   Indicates we only want to see newsgroups using "n" or more blocks.
#
$1 == "THRESHOLD" {
	threshold = $2
	next
}

#
# Record "READERS n ng"
#   Indicates that newsgroup "ng" has "n" readers.
#
$1 == "READERS" {
	num_readers[$3] = $2
	next
}

#
# Record "NEWSGROUP ng n1 n2 ..."
#   Indicates the disk usage of newsgroup "ng".  Each "n" specifies the
#   diskspace used by articles "ndays" or older, where "ndays" is defined
#   by the BREAKDOWN record.  The group is printed only when its first
#   usage figure reaches the threshold; a group with no READERS record
#   is shown with a reader count of 0.
#
$1 == "NEWSGROUP" {
	if ( $3 >= threshold ) {
		if ( num_readers[$2] == "" )
			num_readers[$2] = 0
		printf(FRONT_FMT,$2,num_readers[$2])
		for ( i = 0 ; i < num_breakdn ; ++i )
			printf(FIELD_FMT,$(i+3))
		printf("\n")
	}
	next
}

#
# Anything else is unexpected and is reported on stderr.  The quotes round
# the offending line are written as \047 because a literal single quote
# would end the shell quoting that encloses this awk program.
#
{
	printf("ngsizes - bad line \047%s\047\n", $0) | "cat 1>&2"
}

'
}
generate_report

# Normal completion:  discard the work files unless debugging, then
# report success.
if [ $debug -eq 0 ] ; then
	rm -f $TMP.*
fi
exit 0

