#!/usr/bin/env perl
# ABSTRACT: uses MinHash & SpeedyFx to compare large text data
# PODNAME: minhash_cmp
use strict;
use utf8;
use warnings;

our $VERSION = '0.004'; # VERSION


use File::Slurp;
use Getopt::Long;
use List::MoreUtils qw(distinct);
use Pod::Usage;
use Text::SpeedyFx;

# Command line: optional switches followed by exactly two file operands.
GetOptions(
    q(help)             => \my $help,
    q(binmode=s)        => \my $binmode,
    q(epsilon=f)        => \my $e,
    q(k=i)              => \my $k,
    q(seed=i)           => \my $seed,
) or pod2usage(q(-verbose) => 1);
pod2usage(q(-verbose) => 1)
    if $help or @ARGV != 2;

# Report an invalid option value together with the synopsis and exit with
# status 2, instead of letting the script die later on a division by zero.
sub _usage_error {
    my ($message) = @_;
    pod2usage(
        q(-message) => $message,
        q(-exitval) => 2,
        q(-verbose) => 0,
    );
    return;
}

# Unless --k is given, derive the number of hash functions from the expected
# error: k = 1 / epsilon ** 2, rounded to the nearest integer.
$e //= 0.05;
if (not defined $k) {
    my $e_squared = $e ** 2;

    # Covers epsilon == 0 as well as values whose square underflows to 0.
    _usage_error(q(--epsilon is zero or too close to zero))
        if $e_squared == 0;

    $k = 0 + sprintf q(%0.0f), 1 / $e_squared;
}

# k is the divisor of the final ratio, so it has to be a positive count.
# A large --epsilon can round the derived value down to 0.
_usage_error(
    sprintf
        q(number of hash functions must be at least 1 (got %d); )
        . q(pass a positive --k or a smaller --epsilon),
        $k
) if $k < 1;

# Build k hash functions, each with its own pseudo-random 32-bit seed.
# The generator itself is seeded with a fixed default (0x4c534820 is "LSH "
# in ASCII), so repeated runs pick the same functions unless --seed is given.
srand($seed // 0x4c53_4820);
my @hashers = map { Text::SpeedyFx->new(int rand 2 ** 32) } 1 .. $k;

# Slurp both files. The string concatenation imposes scalar context, so
# read_file() hands back the whole content as one string, not a line list.
my $layer = $binmode // q(:utf8);
my @text = map {
    '' . read_file(
        $_,
        { binmode => $layer }
    )
} @ARGV;

# A hash function "matches" when it produces the same hash_min() value for
# both texts, i.e. when only one distinct value remains. The assignment to a
# scalar makes distinct() return the number of unique values.
my $match = 0;
for my $hasher (@hashers) {
    my $unique = distinct
        map {
            $hasher->hash_min($_);
        } @text;
    ++$match
        if $unique == 1;
}

# The estimated similarity is the fraction of hash functions that matched.
printf qq(k=%d; similarity=%0.5f\n), $k, $match / $k;

__END__
=pod

=encoding utf8

=head1 NAME

minhash_cmp - uses MinHash & SpeedyFx to compare large text data

=head1 VERSION

version 0.004

=head1 SYNOPSIS

    minhash_cmp [options] FILE1 FILE2

=head1 DESCRIPTION

MinHash (or the min-wise independent permutations locality sensitive hashing scheme) is a technique for quickly estimating how similar two sets are.

=head1 OPTIONS

=over 4

=item --help

Show this help text and exit.

=item --binmode

I/O layer used when reading both files, e.g. C<:raw> for binary mode or C<:utf8>.
Default: C<:utf8>

=item --epsilon

Expected error value used to compute the number of different hash functions as C<k = 1 / epsilon ** 2>, rounded to the nearest integer (default: 0.05).

=item --k

Number of different hash functions to use (default: 400; overrides C<--epsilon>).

=item --seed

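Custom integer seed for the random number generator that picks the hash functions.
A fixed built-in seed is used by default, so repeated runs give the same result.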

=back

=head1 CAVEATS

Uses B<a lot of RAM>!
Each initialized hash function takes about 2 MB, and all C<--k> of them are kept in memory at once.

=head1 SEE ALSO

=over 4

=item *

L<MinHash|http://en.wikipedia.org/wiki/MinHash>

=item *

L<Text::SpeedyFx>

=back

=head1 AUTHOR

Stanislaw Pusep <stas@sysd.org>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2012 by Stanislaw Pusep.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut

