#!/usr/bin/perl

use v5.14;
use utf8;
use strictures 2;
use warnings qw(FATAL utf8);
use charnames qw(:full :short);
no warnings "nonchar";
no warnings "surrogate";
no warnings "non_unicode";
use autodie;

use version;
use Getopt::Long::Descriptive;
my $COPYRIGHT;
use Pod::Constants
	-trim => 1,
	'COPYRIGHT AND LICENSE' => sub { ($COPYRIGHT) = s/C<< (.*) >>/$1/gr };
use Path::Tiny;
use Try::Tiny;
use String::Escape qw(unbackslash);

use App::Licensecheck;

=head1 NAME

licensecheck - simple license checker for source files

=head1 VERSION

Version v3.0.4

=cut

our $VERSION = version->declare("v3.0.4");

binmode STDOUT, ':utf8';

my $progname = path($0)->basename;

=head1 SYNOPSIS

B<licensecheck> B<--help>|B<--version>

B<licensecheck> [B<options>...] F<path> [F<path>...]

=head1 DESCRIPTION

B<licensecheck> attempts to determine the license that applies to each file
passed to it, by searching the start of the file for text belonging to
various licenses.

If any of the arguments passed are directories, B<licensecheck> will add
the files contained within to the list of files to process.

=cut

my ( $opt, $usage ) = describe_options(
	'%c %o path [path...]',
	[],
	[   'check|c=s', 'regular expression of files to include',
		{ default => 'common source files' }
	],
	[   'ignore|i=s', 'regular expression of files to skip',
		{ default => 'some backup and VCS files' }
	],
	[ 'recursive|r', 'traverse directories recursively' ],
	[],
	[   'lines|l=i',
		'number of lines to parse from top of each file; set to 0 to parse the whole file (and ignore --tail option), which is often faster',
		{ default => 60 }
	],
	[   'tail=i',
		'number of bytes to parse from bottom of each file; set to 0 to avoid parsing from end of file',
		{ default => 5000 }
	],
	[   'encoding|e=s',
		'try decode source files as specified codec, with latin1 as fallback; by default no decoding is done'
	],
	[],
	[ 'verbose!',  'add header of each file to license information' ],
	[ 'copyright', 'add copyright statements to license information' ],
	[   'skipped|s',
		'print skipped files to STDERR, i.e. files matching --check option but not --ignore option.'
	],
	[ 'deb-fmt!', 'use Debian license shortnames (based on SPDX keywords)' ],
	[   'machine|m',
		'print license information as TAB-separated fields, for processing with line-oriented tools like awk and sort (NB! the --verbose option will kill the readability)'
	],
	[   'deb-machine!',
		'print license information like a Debian copyright file'
	],
	[   'list-delimiter=s',
		'printf-string used between multiple plain list items in Debian copyright file',
		{ default => '\n  ' }
	],
	[   'rfc822-delimiter=s',
		'printf-string used between multiple RFC822-style items in Debian copyright file',
		{ default => '\n   ' }
	],
	[   'merge-licenses!',
		'merge same-licensed files in Debian copyright file'
	],
	[ 'text|t',         '', { hidden => 1 } ],
	[ 'noconf|no-conf', '', { hidden => 1 } ],
	[],
	[ 'help|h', 'print help message and exit', { shortcircuit => 1 } ],
	[   'version|v', 'print version and copyright information and exit',
		{ shortcircuit => 1 }
	],
	{   getopt_conf   => [qw(gnu_getopt)],
		show_defaults => 1,
	},
);

print( $usage->text ), exit if $opt->help;
if ( $opt->version ) { version(); exit 0; }

# From dpkg-source
my $default_ignore_regex = q!
	# Ignore general backup files
	~$|
	# Ignore emacs recovery files
	(?:^|/)\.#|
	# Ignore vi swap files
	(?:^|/)\..*\.swp$|
	# Ignore baz-style junk files or directories
	(?:^|/),,.*(?:$|/.*$)|
	# File-names that should be ignored (never directories)
	(?:^|/)(?:DEADJOE|\.cvsignore|\.arch-inventory|\.bzrignore|\.gitignore)$|
	# File or directory names that should be ignored
	(?:^|/)(?:CVS|RCS|\.pc|\.deps|\{arch\}|\.arch-ids|\.svn|\.hg|_darcs|\.git|
	\.shelf|_MTN|\.bzr(?:\.backup|tags)?)(?:$|/.*$)
!;

my $default_check_regex = q!
	/[\w-]+$ # executable scripts or README like file
	|\.( # search for file suffix
		c(c|pp|xx)? # c and c++
		|h(h|pp|xx)? # header files for c and c++
		|S
		|css|less # HTML css and similar
		|f(77|90)?
		|go
		|groovy
		|lisp
		|scala
		|clj
		|p(l|m)?6?|t|xs|pod6? # perl5 or perl6
		|sh
		|php
		|py(|x)
		|rb
		|java
		|js
		|vala
		|el
		|sc(i|e)
		|cs
		|pas
		|inc
		|dtd|xsl
		|mod
		|m
		|md|markdown
		|tex
		|mli?
		|(c|l)?hs
	)$
!;

my $ignore_regex;
for ( $opt->ignore ) {
	if ( length and $_ ne 'some backup and VCS files' ) {
		$ignore_regex = qr/$_/;
	}
	else {
		$ignore_regex = qr/$default_ignore_regex/x;
	}
}

my $check_regex;
for ( $opt->check ) {
	if ( length and $_ ne 'common source files' ) {
		$check_regex = qr/$_/;
	}
	else {
		$check_regex = qr/$default_check_regex/x;
	}
}

if ( $opt->text ) {
	warn "$0 warning: option -text is deprecated\n";   # remove -text end 2015
}
if ( $opt->noconf ) {
	warn "$0 warning: option --no-conf is deprecated\n";    # No-op
}

print( "$progname: No paths provided.\n", $usage->leader_text ), exit 2
	unless @ARGV;

my $app = App::Licensecheck->new(
	lines   => $opt->lines,
	tail    => $opt->tail,
	verbose => $opt->verbose,
	deb_fmt => $opt->deb_fmt,
);

my @files       = ();
my @find_args   = ();
my $files_count = @ARGV;

push @find_args, qw(-maxdepth 1) unless $opt->recursive;
push @find_args, qw(-follow -type f -print);

while (@ARGV) {
	my $file = shift @ARGV;

	if ( -d $file ) {
		open my $FIND, '-|', 'find', $file, @find_args
			or die "$progname: couldn't exec find: $!\n";

		while ( my $found = <$FIND> ) {
			chomp($found);

			# Silently skip empty files or ignored files
			next if -z $found or $found =~ $ignore_regex;
			if ( $found =~ $check_regex ) {

				# Silently skip empty files or ignored files
				push @files, $found;
			}
			else {
				warn "skipped file $found\n" if $opt->skipped;
			}
		}
		close $FIND;
	}
	elsif ( $file =~ $ignore_regex ) {

		# Silently skip ignored files
		next;
	}
	elsif ( $files_count == 1 or $file =~ $check_regex ) {
		push @files, $file;
	}
	else {
		warn "skipped file $file\n" if $opt->skipped;
	}
}

if ( $opt->deb_machine ) {
	print
		"Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n";
	print "Upstream-Name: FIXME\n";
	print "Upstream-Contact: FIXME\n";
	print "Source: FIXME\n";
	print "Disclaimer: Autogenerated by licensecheck\n\n";
}

my %patternfiles;
my %patternownerlines;
my %patternlicense;

while (@files) {
	my $file = shift @files;
	my ( $license, $copyright );

	if ( $opt->encoding ) {
		try {
			$app->encoding( $opt->encoding );
			( $license, $copyright ) = $app->parse($file);
		}
		catch {
			if (/does not map to Unicode/) {
				print
					"file $file cannot be read with $opt->encoding; encoding, will try latin-1:\n$_"
					if $opt->verbose;
				try {
					$app->encoding('latin-1');
					( $license, $copyright ) = $app->parse($file);
				}
				catch {
					if (/does not map to Unicode/) {
						print
							"file $file cannot be read with latin-1; encoding, will try binary:\n$_"
							if $opt->verbose;
						( $license, $copyright ) = $app->parse($file);
					}
					else {
						die $_;
					}
				}
			}
			else {
				die $_;
			}
		}
	}
	else {
		( $license, $copyright ) = $app->parse($file);
	}

	if ( $opt->deb_machine ) {
		my @ownerlines       = grep {/\w\w/} split /\s\/\s/, $copyright;
		my @ownerlines_clean = ();
		my %owneryears       = ();
		my $owneryears_seem_correct = 1;
		for my $ownerline (@ownerlines) {
			my ( $owneryear, $owner ) = $ownerline =~ /^([\d\-,\s]*)\s*+(.*)/;
			$owneryears_seem_correct = 0 unless ($owneryear);
			$owner =~ s/^by\s+//;
			$owner =~ s/,?\s+All Rights Reserved\.?//gi;
			push @ownerlines_clean, "$owneryear$owner";
			push @{ $owneryears{"$owner"} }, $owneryear;
		}
		my @owners = sort keys %owneryears;
		@owners = ()
			if ( $opt->merge_licenses and $owneryears_seem_correct );
		my $pattern = join( "\n", $license, @owners );
		push @{ $patternfiles{"$pattern"} },      $file;
		push @{ $patternownerlines{"$pattern"} }, @ownerlines_clean;
		$patternlicense{"$pattern"} = $license;
	}
	elsif ( $opt->machine ) {
		print "$file\t$license";
		print "\t" . ( $copyright or "*No copyright*" ) if $opt->copyright;
		print "\n";
	}
	else {
		print "$file: ";
		print "*No copyright* " unless $copyright;
		print $license . "\n";
		print "  [Copyright: " . $copyright . "]\n"
			if $copyright and $opt->copyright;
		print "\n" if $opt->copyright;
	}
}

if ( $opt->deb_machine ) {
	foreach my $pattern (
		sort {
			@{ $patternfiles{$b} } <=> @{ $patternfiles{$a} }
				|| $a cmp $b
		} keys %patternfiles
		)
	{
		my $prev;
		my @ownerlines_unique
			= grep( ( !defined $prev || $_ ne $prev ) && ( ($prev) = $_ ),
			sort @{ $patternownerlines{$pattern} } );
		print "Files: ",
			join(
			unbackslash( $opt->list_delimiter, ),
			sort @{ $patternfiles{$pattern} }
			),
			"\n";
		print "Copyright: ",
			join(
			unbackslash( $opt->rfc822_delimiter, ),
			@ownerlines_unique
			),
			"\n";
		print "License: $patternlicense{$pattern}\n FIXME\n\n";
	}
}

sub version
{
	print <<"EOF";
This is $progname version $VERSION

$COPYRIGHT
EOF
}

=encoding UTF-8

=head1 AUTHOR

Jonas Smedegaard C<< <dr@jones.dk> >>

=head1 COPYRIGHT AND LICENSE

This program is based on the script "licensecheck" from the KDE SDK,
originally introduced by Stefan Westerfeld C<< <stefan@space.twc.de> >>.

  Copyright © 2007, 2008 Adam D. Barratt

  Copyright © 2012 Francesco Poli

  Copyright © 2016 Jonas Smedegaard

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.

=cut
