package File::FormatIdentification::RandomSampling;
# ABSTRACT: methods to identify files using random sampling
our $VERSION = '0.001'; # TRIAL VERSION:
# (c) 2020 by Andreas Romeyke
# licensed via GPL v3.0 or later
use strict;
use warnings;
use feature qw(say);
use Moose;

# Counter tables gathered from the sampled data.
# Once init_bytegrams() has run (it is called from BUILD), slot 0 is an array
# of 256 counters, one per byte value, and slot 1 is an array of 65536
# counters, one per 16-bit word value.
has 'bytegram' => (
    is      => 'rw',
    isa     => 'ArrayRef',
    default => sub {[]}, # a fresh, empty array for every object
);

# Resets both counter tables of the object to zero.
# Slot 0 of the 'bytegram' array receives 256 counters (one per byte value),
# slot 1 receives 65536 counters (one per 16-bit word value).
# Returns: always 1.
sub init_bytegrams {
    my ($self) = @_;
    my $tables = $self->{'bytegram'};
    my @table_sizes = (256, 65536); # onegram, bigram
    foreach my $slot (0 .. $#table_sizes) {
        $tables->[$slot] = [ (0) x $table_sizes[$slot] ];
    }
    return 1;
}

# Moose construction hook: gives every new object zeroed counter tables by
# calling init_bytegrams().
# Returns: always 1.
sub BUILD {
    my ($self) = @_;
    $self->init_bytegrams();
    return 1;
}

# Adds the statistics of $buffer to the object's running counter tables.
#
# Parameters:
#   $buffer - string of raw bytes to account for; undef or an empty string
#             leaves the counters untouched.
# Returns: always 1.
#
# Bytes are counted individually (unpack 'C*'). Words are decoded with
# unpack 'S*', i.e. as consecutive, non-overlapping unsigned 16-bit values in
# the platform's native byte order; a trailing odd byte is therefore not part
# of any word.
sub update_bytegram {
    my ($self, $buffer) = @_;
    return 1 if !defined $buffer || length($buffer) == 0;

    my $bytegram_ref = $self->{'bytegram'};
    # Work directly on the stored tables. Copying them into temporary arrays
    # would move 65792 elements on every call, regardless of buffer size.
    my $onegram = $bytegram_ref->[0] //= [];
    my $bigram  = $bytegram_ref->[1] //= [];

    $onegram->[$_]++ for unpack 'C*', $buffer;
    $bigram->[$_]++  for unpack 'S*', $buffer;
    return 1;
}

# Reports the most frequent byte and word values counted so far.
#
# Returns a hash reference with two keys:
#   onegram - the 8 byte values (0..255) with the highest counts
#   bigram  - the 8 word values (0..65535) with the highest counts
# Both lists are ordered from most to least frequent and hold the values
# themselves, not their counts.
sub calc_histogram {
    my ($self) = @_;
    my $counts = $self->{'bytegram'};
    my $last_kept = 7; # keep indices 0..7, i.e. the 8 leading entries

    my @bytes_by_count = sort { $counts->[0][$b] <=> $counts->[0][$a] } 0 .. 255;
    my @words_by_count = sort { $counts->[1][$b] <=> $counts->[1][$a] } 0 .. 65535;

    return {
        onegram => [ @bytes_by_count[0 .. $last_kept] ],
        bigram  => [ @words_by_count[0 .. $last_kept] ],
    };
}

# Heuristic check for evenly distributed byte values, as produced by random,
# encrypted or compressed data.
#
# The arithmetic mean of all counted bytes is compared with the mean of a
# perfectly uniform distribution over 0..255, which is (0 + 255) / 2 = 127.5.
#
# Returns: an empty list / undef when no bytes have been counted, otherwise a
#          true value if the mean is less than 4 away from 127.5 and a false
#          value if not.
sub is_uniform {
    my ($self) = @_;
    my $unigram = $self->{'bytegram'}->[0];
    my $n   = 0; # number of bytes counted
    my $sum = 0; # sum of all byte values
    foreach my $byte (0 .. 255) {
        my $count = $unigram->[$byte] // 0;
        next unless $count > 0;
        $n   += $count;
        $sum += $count * $byte;
    }
    return if $n == 0;
    # 127.5 rather than 256/2 = 128: byte values run from 0 to 255, so using
    # 128 would shift the acceptance window upwards by half a unit.
    my $expected = 255 / 2;
    my $mean     = $sum / $n;
    return (abs($expected - $mean) < 4);
}

# Heuristic check for "empty" data, i.e. data consisting almost only of very
# small byte values.
#
# The arithmetic mean of all counted bytes is compared with 0.
#
# Returns: an empty list / undef when no bytes have been counted, otherwise a
#          true value if the mean is below 4 and a false value if not.
sub is_empty {
    my ($self) = @_;
    my @counts = @{ $self->{'bytegram'}->[0] };
    my ($total, $weighted) = (0, 0);
    for my $value (0 .. 255) {
        next unless $counts[$value] > 0;
        $total    += $counts[$value];
        $weighted += $counts[$value] * $value;
    }
    return if $total == 0;
    my $mean = $weighted / $total;
    # distance of the mean from the expected value 0
    return (abs(0 - $mean) < 4);
}

# Heuristic check for textual data.
#
# A byte counts as text when it lies in the range 32 .. 173 or is one of the
# whitespace controls TAB (9), LF (10) or CR (13). Without the whitespace
# controls, tab-indented or short-line text would be rejected because its
# line structure alone exceeds the allowed share of non-text bytes.
# NOTE(review): the upper bound 173 is kept from the earlier implementation;
# its rationale is not evident from the code.
#
# Returns: a true value if more than 90% of the counted bytes are text bytes,
#          a false value otherwise (also when nothing has been counted).
sub is_text {
    my ($self) = @_;
    my $unigram = $self->{'bytegram'}->[0];
    my %is_text_control = map { $_ => 1 } (9, 10, 13); # TAB, LF, CR
    my $printable     = 0;
    my $non_printable = 0;
    foreach my $byte (0 .. 255) {
        my $count = $unigram->[$byte] // 0;
        next unless $count > 0;
        if ( ($byte >= 32 && $byte <= 173) || $is_text_control{$byte} ) {
            $printable += $count;
        }
        else {
            $non_printable += $count;
        }
    }
    my $ratio = $printable / ($printable + $non_printable + 1); # +1 to avoid division by zero
    return ($ratio > 0.9);
}

# Heuristic check for video/audio container data (QuickTime-like).
#
# Only the share of the byte value 0x6d among all counted bytes is evaluated.
# Background notes kept from the earlier implementation, none of which is
# checked here: MPEG-TS uses the sync byte 0x47 five times at a distance of
# 188 bytes, and MP4/QuickTime files contain the atom 'mvhd'.
#
# Returns: a true value if the share of 0x6d exceeds 2/256, a false value
#          otherwise (also when nothing has been counted).
sub is_video { # quicktime
    my ($self) = @_;
    my @counts = @{ $self->{'bytegram'}->[0] };
    my $marker = 0x6d;
    my $hits = 0; # occurrences of the marker byte
    my $rest = 0; # occurrences of every other byte
    for my $value (0 .. 255) {
        my $seen = $counts[$value];
        next unless $seen > 0;
        if ($value == $marker) {
            $hits += $seen;
        }
        else {
            $rest += $seen;
        }
    }
    my $share = $hits / ($hits + $rest + 1); # +1 to avoid division by zero
    return ($share > 2/256);
}


# Classifies a buffer of sampled bytes.
#
# Parameters:
#   $buffer - string of raw bytes to classify
# Returns one of the strings "empty", "text", "video/audio",
# "random/encrypted/compressed" or "undef" (no heuristic matched).
#
# The counter tables are reset first, so the result depends on $buffer only.
# The heuristics are tried in a fixed order and the first match wins.
sub calc_type {
    my ($self, $buffer) = @_;

    $self->init_bytegrams();
    $self->update_bytegram($buffer);

    return "empty"                       if $self->is_empty();
    return "text"                        if $self->is_text();
    return "video/audio"                 if $self->is_video();
    return "random/encrypted/compressed" if $self->is_uniform();
    return "undef";
}

# Remove the keywords imported by "use Moose" (such as "has") from this package.
no Moose;

# Finalise the class through its metaclass.
__PACKAGE__->meta->make_immutable;

# A module file has to end with a true value.
1;

__END__

=pod

=encoding UTF-8

=head1 NAME

File::FormatIdentification::RandomSampling - methods to identify files using random sampling

=head1 VERSION

version 0.001

=head1 AUTHOR

Andreas Romeyke <pause@andreas-romeyke.de>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2020 by Andreas Romeyke.

This is free software, licensed under:

  The GNU General Public License, Version 3, June 2007

=cut
