#!/usr/bin/env perl
#
#   A utility from Pheno-Ranker to convert a CSV to:
#
#   1 - Input file (JSON array of objects)
#   2 - Configuration file (needed for Pheno-Ranker)
#
#   Last Modified: Mar/21/2024
#
#   $VERSION taken from Pheno::Ranker
#
#   Copyright (C) 2023-2024 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
#
#   License: Artistic License 2.0
#
#   If this program helps you in your research, please cite.

use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;

### Main #####
# Entry point: parse the command line, convert the input and write the outputs
process_csv();
##############
exit;

# Command-line driver.
#
# Parses and validates the options, builds a CSV2PhenoRanker object with them
# and calls its run() method. Takes no arguments (options are read from @ARGV).
#
# Exits through pod2usage() when:
#   - an unknown/invalid option is given
#   - --input is missing, is not a regular file or does not end in .csv/.tsv
#   - --generate-primary-key is used without --primary-key-name
#   - --output-dir is given but is not an existing directory
sub process_csv {

    my $VERSION   = '0.08';
    my $array_sep = '\|';     # Default array separator (used as a regex)

    # Reading arguments
    # NB: 'verbose' needs the explicit 'v' alias. Without it '-v' is resolved by
    # auto-abbreviation and is ambiguous between 'verbose' and 'version'
    GetOptions(
        'input|i=s'            => \my $input,                                  # string
        'primary-key-name=s'   => \my $primary_key_name,                       # string
        'generate-primary-key' => \my $generate_primary_key,                   # flag
        'separator|sep=s'      => \my $sep,                                    # str
        'array-separator=s'    => \$array_sep,                                 # str
        'output-dir=s'         => \my $output_dir,                             # str
        'help|?'               => \my $help,                                   # flag
        'man'                  => \my $man,                                    # flag
        'debug=i'              => \my $debug,                                  # integer
        'verbose|v'            => \my $verbose,                                # flag
        'version|V'            => sub { print "$0 Version $VERSION\n"; exit; }
    ) or pod2usage(2);
    pod2usage(1)                              if $help;
    pod2usage( -verbose => 2, -exitval => 0 ) if $man;

    # The input must exist and carry a .csv or .tsv extension
    pod2usage(
        -message =>
          "Please specify a valid CSV|TSV input with --i <file.csv>\n",
        -exitval => 1
    ) unless ( $input && $input =~ m/\.(csv|tsv)$/ && -f $input );

    # A generated primary key needs a name
    pod2usage(
        -message => "Please specify a --primary-key-name <my_string>\n",
        -exitval => 1
    ) if ( $generate_primary_key && !$primary_key_name );

    # The output directory (if any) must already exist
    pod2usage(
        -message => "Please specify a valid directory for --output-dir\n",
        -exitval => 1
    ) if ( defined $output_dir && !-d $output_dir );

    # Create object
    my $csv = CSV2PhenoRanker->new(
        {
            input                => $input,
            primary_key_name     => $primary_key_name,
            generate_primary_key => $generate_primary_key,
            output_dir           => $output_dir,
            sep                  => $sep,
            array_sep            => $array_sep,
            debug                => $debug,
            verbose              => $verbose
        }
    );

    # Run method
    $csv->run;
}

package CSV2PhenoRanker;

use strict;
use warnings;
use autodie;
use feature qw(say);
use Data::Dumper;
use Path::Tiny;
use File::Basename;
use File::Spec::Functions qw(catdir catfile);
use YAML::XS qw(LoadFile DumpFile);
use JSON::XS;
use Text::CSV_XS;

# Constructor.
#
# Receives the class name and a hash ref with the settings and returns that
# very same hash ref blessed into the class (it is blessed in place, not copied).
sub new {
    my $class    = shift;
    my $settings = shift;
    return bless $settings, $class;
}

# Runs the whole conversion for the object's input file:
#
#   1 - Reads the CSV (read_csv)
#   2 - Builds the configuration (create_config)
#   3 - Writes <basename>.json and <basename>_config.yaml
#
# The files go to $self->{output_dir} or, when that is undefined, to the
# directory of the input file. Returns 1.
sub run {

    my $self = shift;

    # Read the input file
    my ( $data, $arrays, $non_arrays ) = $self->read_csv();

    # Build the configuration data BEFORE writing anything. create_config dies
    # on an unknown primary key and we don't want to leave a JSON file behind
    my $config = $self->create_config( $arrays, $non_arrays );

    # Define output directory and basename (input name without its extension)
    my ( $name, $path ) = fileparse( $self->{input}, qr/\.[^.]*/ );
    my $output_dir = $self->{output_dir} // $path;    # Use defined-or operator

    # Write data as JSON
    my $json_file = catfile( $output_dir, qq/${name}.json/ );
    say "Writing <$json_file> " if $self->{verbose};
    write_json( { filepath => $json_file, data => $data } );

    # Write the configuration file as YAML
    my $yaml_file = catfile( $output_dir, qq/${name}_config.yaml/ );
    say "Writing <$yaml_file> " if $self->{verbose};
    write_yaml( { filepath => $yaml_file, data => $config } );
    return 1;
}

# Serializes a data structure as JSON and writes it to disk.
#
# Takes a hash ref with:
#   filepath - path of the file to write
#   data     - data structure to encode
#
# Returns 1.
sub write_json {

    my ($params) = @_;

    # Note that canonical DOES not match the order of nsort from Sort::Naturally
    my $encoder = JSON::XS->new->utf8->canonical->pretty;
    my $text    = $encoder->encode( $params->{data} );

    path( $params->{filepath} )->spew($text);
    return 1;
}

# Dumps a data structure to a YAML file.
#
# Takes a hash ref with:
#   filepath - path of the file to write
#   data     - data structure to dump
#
# Returns 1.
sub write_yaml {

    my ($params) = @_;
    my ( $yaml_path, $payload ) = @{$params}{qw(filepath data)};

    # Boolean handling is switched to JSON::PP only for the duration of the dump
    local $YAML::XS::Boolean = 'JSON::PP';
    DumpFile( $yaml_path, $payload );
    return 1;
}

# Reads the input CSV/TSV file and loads all its rows in memory.
#
# Settings are taken from the object: input, primary_key_name,
# generate_primary_key, sep and array_sep.
#
# Returns a list of three array refs:
#   1 - Rows: one hash ref per data line, keyed by header name. Cells of
#       array columns are array refs, the rest are plain strings (or undef
#       when a line has fewer fields than the header)
#   2 - Names of the columns detected as arrays (no particular order)
#   3 - Names of the remaining columns (in header order)
#
# A column is considered an array as soon as one of its cells matches the
# array separator.
#
# Dies if the header line cannot be read or if --generate-primary-key is
# requested for a primary key name that is already a column.
sub read_csv {

    my $self                 = shift;
    my $input                = $self->{input};
    my $primary_key_name     = $self->{primary_key_name};       # has to be non-array
    my $generate_primary_key = $self->{generate_primary_key};
    my $sep                  = $self->{sep};
    my $array_sep            = $self->{array_sep} // '\|';

    # A single-character separator is always taken literally so that regex
    # metacharacters such as '|' or '.' work as expected. Longer values
    # (e.g., the default '\|') are still interpreted as regular expressions
    $array_sep = quotemeta $array_sep if length($array_sep) == 1;
    my $array_sep_qr = qr/$array_sep/;

    # Create a Text::CSV_XS object. The field separator is the one set by the
    # user or, by default, the one derived from the file extension
    my $csv = Text::CSV_XS->new(
        {
            binary    => 1,
            sep_char  => define_separator( $input, $sep ),
            auto_diag => 1
        }
    );

    # Open filehandle
    open my $fh, '<:encoding(utf-8)', $input;

    # Parse the CSV data
    my $headers = $csv->getline($fh);
    die "Unable to read the header line from <$input>. Is the file empty?\n"
      unless ( $headers && @$headers );

    # Get rid of problematic characters on headers
    $_ =~ tr/()//d for @$headers;

    # Add $primary_key_name to headers if $generate_primary_key
    if ($generate_primary_key) {

        # Check that primary_key_name does not exist
        die
"Primary key <$primary_key_name> already exists. Are you sure you need the <--generate-primary-key> flag?\n"
          if ( grep { $_ eq $primary_key_name } @$headers );

        # Make it last element of the array
        push @$headers, $primary_key_name;
    }

    #####################
    # START READING CSV #
    #####################

    my ( @rows, %array, %non_array );
    my $count = 1;
    say "Start reading <$input>" if $self->{verbose};
    while ( my $row = $csv->getline($fh) ) {

        # Print if verbose
        say "Reading row $count..."
          if ( $self->{verbose} && $count % 1_000 == 0 );

        # Load data
        my %data;
        @data{@$headers} = @$row;

        # Add id if $generate_primary_key
        # It is assigned by name (not by position) so that it always ends up
        # in the right column, even if the line has too few or too many fields
        $data{$primary_key_name} = 'PR_' . sprintf( "%08d", $count )
          if $generate_primary_key;

        # *** IMPORTANT ***
        # Columns can consist of arrays or strings
        # Here we load all as strings and we re-format array fields a posteriori

        for my $key ( keys %data ) {

            # Check array/non-array based on regex
            # Cells are undefined when the line is shorter than the header
            $array{$key}++
              if ( defined $data{$key} && $data{$key} =~ $array_sep_qr );
        }

        push @rows, \%data;
        $count++;
    }
    close $fh;
    say "Reading <$input> completed!" if $self->{verbose};

    ###################
    # END READING CSV #
    ###################

    # Load array/non-array
    my @array = keys %array;

    # Filter the original array to exclude elements found in the @array
    my @non_array = grep { !$array{$_} } @$headers;

    # Re-arrange array variables
    split_array_fields( \@rows, \@array, $array_sep );

    return ( \@rows, \@array, \@non_array );
}

# Converts the cells of the array columns from strings to array refs.
#
# Arguments:
#   $rows      - array ref of row hash refs (modified in place)
#   $array     - array ref with the names of the array columns
#   $array_sep - separator, interpolated as a regular expression
#
# Returns 1.
sub split_array_fields {

    my ( $rows, $array, $array_sep ) = @_;

    # Split array fields (values delimited by $array_sep) into an array_ref
    # Modify the original data structure directly
    for my $row (@$rows) {
        for my $key (@$array) {

            # Undefined cells (e.g., lines shorter than the header) become an
            # empty array instead of raising an 'uninitialized value' warning
            $row->{$key} = [ split /$array_sep/, $row->{$key} // '' ];
        }
    }
    return 1;
}

# Builds the data structure for the configuration file.
#
# Arguments:
#   $array     - array ref with the names of the array columns
#   $non_array - array ref with the names of the non-array columns
#
# The primary key is $self->{primary_key_name} when defined (it must be one of
# the columns, otherwise we die). If undefined, 'id' is used when it is a
# non-array column, else the first non-array column in sorted order.
#
# Returns a hash ref with the keys 'format', 'primary_key' and 'allowed_terms'
# plus, when there are array columns, 'array_terms' and 'id_correspondence'.
sub create_config {

    my ( $self, $array, $non_array ) = @_;

    my @array_terms     = sort @$array;
    my @non_array_terms = sort @$non_array;

    # Set primary key
    my $primary_key = $self->{primary_key_name};
    if ( !defined $primary_key ) {
        my $has_id = grep { $_ eq 'id' } @non_array_terms;
        $primary_key = $has_id ? 'id' : $non_array_terms[0];
    }
    else {
        my $is_column =
          grep { $_ eq $primary_key } ( @array_terms, @non_array_terms );
        die "Primary-key <$primary_key> not found\n" unless $is_column;
    }

    # Settings shared by files with and without array columns
    my %config = (
        format        => 'CSV',
        primary_key   => $primary_key,
        allowed_terms => [@non_array_terms]
    );

    # Nothing else to add if there are no array columns
    return \%config unless @array_terms;

    # Each list is sorted on its own, the merged one has to be sorted again
    $config{array_terms}   = [@array_terms];
    $config{allowed_terms} = [ sort( @non_array_terms, @array_terms ) ];

    # Every array column is mapped to itself
    my @self_maps = map { +{ $_ => $_ } } @array_terms;
    $config{id_correspondence} = { CSV => \@self_maps };

    return \%config;
}

# Chooses the field separator for the input file.
#
# Arguments:
#   $filepath - path of the input file
#   $sep      - separator set by the user (may be undefined)
#
# Returns $sep when defined. Otherwise a tab for files ending in '.tsv' and
# a comma for everything else ('.csv' or any other extension).
sub define_separator {

    my ( $filepath, $sep ) = @_;

    # Separators associated with the known file extensions
    my %separator_for = (
        '.csv' => ',',
        '.tsv' => "\t"
    );

    # The third element returned by fileparse is the matched suffix (if any)
    my $ext = ( fileparse( $filepath, qw(.csv .tsv) ) )[2];

    # The user's choice always wins
    return $sep if defined $sep;

    # Default to comma if the extension is not a known one
    return $separator_for{$ext} // ',';
}

1;

=head1 NAME

csv2pheno-ranker: A script to convert a CSV to an input suitable for Pheno-Ranker

=head1 SYNOPSIS

csv2pheno-ranker -i <input.csv> [-options]

     Arguments:
       -i, --input <input.csv>        CSV file

     Options:
       -generate-primary-key          Generates a primary key if absent. Use --primary-key-name to set its name
       -primary-key-name <name>       Sets the name for the primary key. Must be a single, non-array field
       -sep, --separator <char>       Delimiter for CSV fields [',' for .csv, tab for .tsv] (e.g., --sep $'\t' for tabs)
       -array-separator <char>        Delimiter for nested arrays [|] (e.g., --array-separator ';' for semicolons)
       -output-dir <directory>        Specify the directory where output files will be stored. If not specified, outputs will be placed in the same directory as the input file

     Generic Options:
       -debug <level>                 Print debugging (from 1 to 5, being 5 max)
       -h, --help                     Brief help message
       -man                           Full documentation
       -v, --verbose                  Verbosity on
       -V, --version                  Print version


=head1 DESCRIPTION

Numerous tools exist for CSV to JSON conversion, but our focus here was on creating JSON specifically for C<Pheno-Ranker>. The script supports both basic CSV files and complex, comma-separated CSV files with nested fields, ensuring seamless C<Pheno-Ranker> integration.

The script will create both a JSON file and the configuration file for C<Pheno-Ranker>. Then, you can run C<Pheno-Ranker> as:

 $ pheno-ranker -r my_csv.json --config my_csv_config.yaml

Note that we load all data in memory before dumping the JSON file. If you have a huge CSV (e.g., >5M rows) please use a computer that has enough RAM.

=head1 SUMMARY

A script to convert a CSV to an input suitable for C<Pheno-Ranker>

=head1 INSTALLATION

(only needed if you did not install C<Pheno-Ranker>)

 $ cpanm --sudo --installdeps .

=head3 System requirements

  * Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOs, OpenSuse) should do as well.
  * Perl 5 (>= 5.10 core; installed by default in most Linux distributions). Check the version with "perl -v"
  * 1GB of RAM.
  * 1 core (it only uses one core per job).
  * At least 1GB HDD.

=head1 HOW TO RUN CSV2PHENO-RANKER

The software requires a CSV file as the input and operates with default settings. By default, both the C<JSON> file and the configuration file will be created in the same directory as the input file, and will share the same basename.

If you have columns with nested values make sure that you use C<--array-separator> to define the delimiting character (default is "|").

If you want to change some parameters please take a look at the synopsis.

B<Examples:>

 $ ./csv2pheno-ranker -i example.csv
 
 $ ./csv2pheno-ranker -i example.csv --generate-primary-key --primary-key-name ID

 $ ./csv2pheno-ranker -i example.csv --generate-primary-key --primary-key-name ID  --output-dir /my-path --sep ';' --array-separator ','

=head2 COMMON ERRORS AND SOLUTIONS

 * Error message: Foo
   Solution: Bar

 * Error message: Foo
   Solution: Bar

=head1 AUTHOR 

Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>.

=head1 COPYRIGHT AND LICENSE

This PERL file is copyrighted. See the LICENSE file included in this distribution.

=cut
