#!/usr/bin/env perl
# License: Public Domain or CC0
# See https://creativecommons.org/publicdomain/zero/1.0/
# The author, Jim Avera (jim.avera at gmail) has waived all copyright and
# related or neighboring rights.  Attribution is requested but is not required.
use strict; use warnings FATAL => 'all'; use 5.010;

our $VERSION = '1.016'; # VERSION
our $DATE = '2023-06-28'; # DATE

#use 5.12; # for unicode_strings
use v5.18; # for lexical_subs

use strict; use warnings;
use feature qw(switch state say lexical_subs fc);
use feature qw(unicode_strings unicode_eval evalbytes);
no warnings "experimental::lexical_subs";

use Carp; $Carp::MaxArgNums = 0;
use Data::Dumper::Interp;
use File::Basename qw(basename dirname fileparse);
use File::Path qw(make_path remove_tree);
use File::Temp qw(tempfile tempdir);
use File::Copy ();
use File::Spec;
use File::Spec::Functions qw(canonpath catfile catdir rootdir tmpdir);
use FindBin qw/$Bin $Script/;
use Getopt::Long qw/GetOptions/;
use Guard qw(guard scope_guard);
use List::Util qw/min max any first/;
use List::MoreUtils qw/indexes/;
use Scalar::Util qw/openhandle/;
use Pod::Usage qw/pod2usage/;
use Spreadsheet::Edit qw(title2ident);
use Spreadsheet::Edit::IO qw/convert_spreadsheet
             sheetname_from_spec filepath_from_spec form_spec_with_sheetname/;
use Term::ReadKey ();
use Encode 3.00 qw/decode/; # for ONLY_PRAGMA_WARNINGS
require PerlIO;
sub oops(@) { unshift @_, "oops "; require Carp; goto &Carp::confess; }
#$SIG{__WARN__} = sub{ Carp::cluck @_ };

sub main::Differ::compile_if_regex(@); #forward

# Replace invalid/undesirable filename characters with underscore
sub sanitize_filename(_) { local $_ = shift; s/[^-._[:word:]]/_/g; s/_$//; $_ }

use utf8;

my ($visible_space, $RArrow);

# By default set output encoding to match the user's terminal/locale,
# and suppress the "...does not mapt to..." warnings if unsupported characters
# are displayed e.g. in spreadsheet data diff displays (\x{hex} escapes will
# be displayed for un-encodeable characters).
# May be overridden by the --output-encoding option!

# https://rt.cpan.org/Public/Bug/Display.html?id=88592
$PerlIO::encoding::fallback |= Encode::ONLY_PRAGMA_WARNINGS;
no warnings 'utf8';
use open ':std', ':locale';

select STDERR; $| = 1; select STDOUT; $| = 1;

my $stdout_encoding;
sub decode_foruser($) {
  my $octets = shift;
  $stdout_encoding
    ? decode($stdout_encoding, $octets, Encode::FB_DEFAULT|Encode::LEAVE_SRC)
    : $octets
}
sub encode_foruser($) {
  my $chars = shift;
  $stdout_encoding
    ? encode($stdout_encoding, $chars, Encode::FB_DEFAULT|Encode::LEAVE_SRC)
    : $chars
}

#-------- Get Arguments ---------------
sub call_pod2usage {
  confess "bug" if (scalar(@_) % 2) != 0;
  my %opts = @_;
#  if (! $opts{-msg}) {
#    if (my $podpath = pod_where({-inc => 1},"App::diff_spreadsheets")) {
#      $opts{-input} = $podpath;
#    } else {
#      warn "Could not find App::diff_spreadsheets in \@INC\n",
#           "\@INC:\n", join("\n   ",@INC), "\n";
#    }
#  }
  pod2usage(\%opts);
}

sub badargs_exit(@) {
  call_pod2usage(-output => \*STDERR, -exitval => 2, @_);
}

# We could use Term::Encoding to detect the terminal's encoding
# but that would create a possibly-undesirable dependency.
# We just assume UTF-8, which these days is probably correct.
#my $rightarrow = '->';
#my $rightarrow = "\N{RIGHTWARDS ARROW}\N{THIN SPACE}";
#my $rightarrow = "\N{RIGHTWARDS ARROW}\N{NARROW NO-BREAK SPACE}";
#my $rightarrow = "\N{RIGHTWARDS ARROW}\N{HAIR SPACE}";

# 0 or undef to not wrap long lines
my $maxwidth = do{
  u($ENV{COLUMNS}) =~ /^[1-9]\d*$/
  ? # Overrides actual terminal width; needed for testing
    $ENV{COLUMNS}
  : do{
      my $wmsg = ""; # Suppress hard-coded "didn't work" from Term::ReadKey
                     #  or "stty: standard input: Not a tty" etc.
      local $SIG{'__WARN__'} = sub { $wmsg .= $_[0] };
      my ($width, $height) = Term::ReadKey::GetTerminalSize(*STDOUT);
      #warn $wmsg if $wmsg && $wmsg !~ /did.*n.*work/i;
      $width // 80
    }
};

my %opts = (
  sort_rows       => 1,
  quote_char      => '"',
  sep_char        => ',',
  encoding        => 'UTF-8',
  trunc_title_width => $maxwidth > 60 ? int($maxwidth/3) : 20,
);
my $method = "native";
my @diff_opts;
my $help;

### MAIN ###

badargs_exit(-msg => "Usage: $Script file1 file2\n   -h for help") if @ARGV==0;
Getopt::Long::Configure ("default", "gnu_getopt", "auto_version");
GetOptions(
   "always-show-columns=s"     => sub{ push @{$opts{always_show_columns}}, $_[1] },
   "a|text"                    => sub{ push @diff_opts, "-$_[0]"; },
   "B|ignore-blank-lines"      => sub{ push @diff_opts, "-$_[0]"; },
   "b|ignore-space-change"     => sub{ push @diff_opts, "-$_[0]"; },

   # --color is valid only for 'git diff'
   "color=s"                   => sub{ push @diff_opts, "-$_[0]=$_[1]"; },

   "columns=s"                 => sub{ push @{$opts{columns}}, $_[1] },
   "c|C|context=i"             => sub{ push @diff_opts, "-$_[0]", $_[1]; },
   "debug"                     => sub{ $opts{debug} = $_[1] },
   "D|ifdef=s"                 => sub{ push @diff_opts, "-$_[0]", $_[1]; },
   "d|minimal"                 => sub{ push @diff_opts, "-$_[0]"; },
   "encoding=s"                => sub{ $opts{encoding} = $_[1] },
   "e|ed"                      => sub{ push @diff_opts, "-$_[0]"; },
   "E|ignore-tab-expansion"    => sub{ push @diff_opts, "-$_[0]"; },
   "first-data-row=i"          => sub{ $opts{first_data_row} = $_[1] },
   "F|show-function-line=s"    => sub{ push @diff_opts, "-$_[0]", $_[1]; },
   "GTYPE-group-format=s"      => sub{ push @diff_opts, "-$_[0]", $_[1]; },
   "hash-func=s"               => sub{ $opts{hash_func} = $_[1] },
   "hashid-func=s"             => sub{ $opts{hashid_func} = $_[1] },
   "horizon-lines=s"           => sub{ push @diff_opts, "-$_[0]", $_[1]; },
   "h|help"                    => \$help,
   "id-columns=s"              => sub{ push @{$opts{id_columns}}, $_[1] },
   "i|ignore-case"             => sub{ push @diff_opts, "-$_[0]"; },
   "I|ignore-matching-lines=s" => sub{ push @diff_opts, "-$_[0]", $_[1]; },
   "keep-temps!"               => sub{ $opts{keep_temps} = $_[1] },
   "label=s"                   => sub{ push @diff_opts, "-$_[0]", $_[1]; },
   "line-format=s"             => sub{ push @diff_opts, "-$_[0]", $_[1]; },
   "LTYPE-group-format=s"      => sub{ push @diff_opts, "-$_[0]", $_[1]; },
   "l|paginate"                => sub{ push @diff_opts, "-$_[0]"; },
   "m|method=s"                => \$method,
   "n|rcs"                     => sub{ push @diff_opts, "-$_[0]"; },
   "output-encoding=s"         => sub{ $opts{output_encoding} = $_[1] },
   "p|show-c-function"         => sub{ push @diff_opts, "-$_[0]"; },
   "quiet|silent"              => sub{ $opts{silent} = $_[1] },
   "quote-char=s"              => sub{ $opts{quote_char} = $_[1] },
   "q|brief"                   => sub{ push @diff_opts, "-$_[0]"; },
   "sep-char=s"                => sub{ $opts{sep_char} = $_[1] },
   "setup-code=s"              => sub{ $opts{setup_code} = $_[1] },
   "sheets=s"                  => sub{ push @{$opts{sheet_names}}, $_[1] },
   "show-empties"              => sub{ $opts{show_empties} = $_[1] },
   "sort-rows!"                => sub{ $opts{sort_rows} = $_[1] }, # allow --no-sort
   "speed-large-files"         => sub{ push @diff_opts, "-$_[0]"; },
   "strip-trailing-cr"         => sub{ push @diff_opts, "-$_[0]"; },
   "suppress-common-lines"     => sub{ $opts{suppress_common_lines} = 1; push @diff_opts, "-$_[0]"; },
   "s|report-identical-files"  => sub{ push @diff_opts, "-$_[0]"; },
   "tabsize=i"                 => sub{ push @diff_opts, "-$_[0]", $_[1]; },
   "title-row=i"               => sub{ $opts{title_row} = $_[1] },
   "t|expand-tabs"             => sub{ push @diff_opts, "-$_[0]"; },
   "T|initial-tab"             => sub{ push @diff_opts, "-$_[0]"; },
   "u|U|unified:i"             => sub{ push @diff_opts, "-$_[0]", $_[1]?($_[1]):(); },
   "v|verbose"                 => sub{ $opts{verbose} = $_[1] },

   # valid only for 'git diff':
   "word-diff-regex=s"         => sub{ push @diff_opts, "-$_[0]=$_[1]"; },
   "word-diff=s"               => sub{ push @diff_opts, "-$_[0]=$_[1]"; },
   "ws-error-highlight=s"      => sub{ push @diff_opts, "-$_[0]=$_[1]"; },

   "w|ignore-all-space"        => sub{ push @diff_opts, "-$_[0]"; },
   "W|width=i"                 => sub{ $maxwidth = $_[1]; push @diff_opts, "-$_[0]", $_[1]; },
   "y|side-by-side"            => sub{ push @diff_opts, "-$_[0]"; },
   "Z|ignore-trailing-space"   => sub{ $opts{ign_trailing_spaces} = 1; push @diff_opts, "-$_[0]"; },
   # Options valid only for 'git diff':
) or badargs_exit(-msg => "$Script -h for help");

call_pod2usage(-verbose => 2, -output => \*STDOUT) if $help;

foreach (@diff_opts) { s/^-(\w\w+)/--$1/ }  # change -longopt to --longopt

foreach my $key (qw/columns id_columns always_show_columns sheet_names/) {
  next unless $opts{$key};
  @{$opts{$key}} = map{ split/(?<!\\),/ } @{$opts{$key}}
}
$opts{verbose} //= $opts{debug};

if ($opts{output_encoding}) {
  my $crlf = grep(/crlf/, PerlIO::get_layers(*STDOUT)) ? ":crlf" : "";
  my $arg = ":raw${crlf}:encoding($opts{output_encoding})";
  warn ">binmode STDOUT/ERR '$arg'\n" if $opts{debug};
  binmode(\*STDOUT, $arg) or die "binmode $arg : $!";
  binmode(\*STDERR, $arg) or die "binmode $arg : $!";
}
($stdout_encoding) = map{ /encoding\((.+)\)/ ? ($1) : () }
                     PerlIO::get_layers(*STDOUT);

($visible_space, $RArrow) =
    ($stdout_encoding =~ /utf/i) 
      ? ("\N{MIDDLE DOT}", "\N{RIGHTWARDS ARROW}\N{HAIR SPACE}")
      : (".",              "-->") ;

our ($hash_func_code, $hashid_func_code, $setup_code_code);
foreach my $argname (qw(hashid-func hash-func setup-code)) {
  (my $optskey = $argname) =~ s/-/_/g;
  next unless defined $opts{$optskey};
  my $source = "package Usercode;" . $opts{$optskey};
  no strict 'refs';
  ${"${optskey}_code"} = eval $source;
  die "Syntax error in Perl code for --$argname option:\n$@" if $@;
  die "--$argname did not produce a sub ref (got ",u(${"${optskey}_code"}),"\n"
    unless ref(${"${optskey}_code"}) eq "CODE";
}
$hashid_func_code //= sub{ Carp::cluck "bug" if grep{! defined} @{$_[0]}; join ",", @{$_[0]} };
$hash_func_code   //= sub{ Carp::cluck "bug" if grep{! defined} @{$_[0]}; join ",", @{$_[0]} };
$setup_code_code  //= sub{};

badargs_exit(-msg => "Two files must be specified") if @ARGV != 2;
$opts{origpath1} = $ARGV[0];
$opts{origpath2} = $ARGV[1];

if ($opts{keep_temps}) {
  my $dir = catfile(tmpdir(), "dstmp");  # /tmp/dstmp
  remove_tree($dir);
  mkdir $dir or die "mkdir $dir : $!";
  warn "> Created ",qsh($dir),"\n";
  $opts{tempdir} = $dir;
} else {
  $opts{tempdir} = tempdir("diffspread_XXXXX", DIR=>tmpdir(), CLEANUP=>1);
}

# Extract the possibly-multiple "sheets" from each spreadsheet into
# separate .csv files and process the corresponding pairs.
# If a specific sheet was specified, then only that sheet will be extracted.
foreach my $N (1,2) {
  my $origpath = $opts{"origpath$N"};
  my $sheet_from_path = sheetname_from_spec($origpath);
  if (defined $sheet_from_path) {
    die "--sheets argument not allowed because filename specifies a sheet:\n",
        "  ", qsh($origpath), "\n"
      if $opts{sheet_names};
  }
  #my $dir = catdir($opts{tempdir},title2ident($origpath));#unique??
  my $dir = catdir($opts{tempdir},"--INFILE$N--");
  remove_tree $dir; make_path $dir || die;
  my $h = convert_spreadsheet($origpath, cvt_to=>"csv",
                                         allsheets => !$sheet_from_path,
                                         outpath => $dir,
                                         silent  => $opts{silent},
                                         verbose => $opts{debug},
                                         debug   => $opts{debug},
                             );
  $opts{cvt_from} = $h->{cvt_from};
  if ($h->{cvt_from} =~ /csv/i && !openhandle($origpath)) {
    # Input was already a CSV (possibly detected by peeking at actual content).
    # The "converted" file has a bogus name; use the original path instead.
    $opts{"csvpaths$N"} = [ $origpath ];
  } else {
    opendir(my $dh, $dir) or die "$dir : $!\n", dvis('$h');
    my @csvpaths = sort map{ catdir($dir,$_) }
                        File::Spec->no_upwards(readdir $dh) ;
    if (0 and @csvpaths == 1) { ## FIXME WHY IS THIS DISABLED ???
                                ##Answer: Because we want to see the sheetname even if user did not specify it
      # Only one sheet in this file: Rename the csv to just show the
      # spreadsheet file name without a redundant [sheetname] suffix.
      my $new_tmpfname = catdir($dir, basename($origpath)).".csv";
      rename $csvpaths[0], $new_tmpfname or die;
      $opts{"csvpaths$N"} = [ $new_tmpfname ];
    } else {
      $opts{"csvpaths$N"} = \@csvpaths;
    }
  }
  my %sheet2csvpath;  # sheetname => csvpath
  foreach (@{ $opts{"csvpaths$N"} }) {
    my ($sheetname,undef,undef) = fileparse($_, qr/\.csv/i);
    die if exists $sheet2csvpath{$sheetname};
    $sheet2csvpath{$sheetname} = $_;
  };
  $opts{"sheet2csvpath$N"} = \%sheet2csvpath;
}

my $onlyone = @{ $opts{"csvpaths1"} } == 1 && @{ $opts{"csvpaths2"} } == 1;

# If --sheets arg(s) were given, diff only pairs of so-named sheets.
# Otherwise if each file contains only one sheet then diff them regardless
# of their names, but with a multi-sheet file diff all (and only) pairs of
# same-named sheets.

my $status = 0;
$opts{sheetname_str} = "";
my @pairs;  # { n1 n2 label1 label2 sheetname_str }
if (@{$opts{sheet_names}//[]}) {
  # Sheet names were specified; corresponding sheets might have different
  # names if the user specified a regex.
  foreach (@{$opts{sheet_names}}) {
    my $spec = main::Differ::compile_if_regex($_);
    if (ref($spec)) {
      my @names1 = grep{ /$spec/s } keys %{ $opts{sheet2csvpath1} };
      die "$spec matches no sheet name in $opts{origpath1}\n"
        unless @names1 > 0;
      die "'$spec' matches multiple sheet names in $opts{origpath1}\n"
        if @names1 > 1;
      my @names2 = grep{ /$spec/s } keys %{ $opts{sheet2csvpath2} };
      die "$spec matches no sheet name in $opts{origpath2}\n"
        unless @names2 > 0;
      die "'$spec' matches multiple sheet names in $opts{origpath2}\n"
        if @names2 > 1;
      push @pairs, { n1 => $names1[0], n2 => $names2[0],
                     sheetname_str => $names1[0] eq $names2[0]
                                 ? vis($names1[0])
                                 : vis($names1[0])."/".vis($names2[0])
                   };
    } else {
      die "sheet '$spec' does not exist in $opts{origpath1}\n"
        unless $opts{sheet2csvpath1}{$spec};
      die "sheet '$spec' does not exist in $opts{origpath2}\n"
        unless $opts{sheet2csvpath2}{$spec};
      push @pairs, { n1 => $spec, n2 => $spec, sheetname_str => vis($spec) };
    }
  }
}
elsif ($onlyone) {
  # User did not specity a sheet name, but there is only one in each file.
  # Compare them regardless of their names.
  my ($name1, $bug1) = keys %{ $opts{sheet2csvpath1} }; die "bug1" if $bug1;
  $opts{path1} = $opts{sheet2csvpath1}{$name1};
  my ($name2, $bug2) = keys %{ $opts{sheet2csvpath2} }; die "bug2" if $bug2;
  # set the displayed names to just the spreadsheet paths sans sheetnames.
  push @pairs, { n1 => $name1, n2 => $name2,
                 label1 => $opts{origpath1}, label2 => $opts{origpath2} };
}
else {
  # User did not specity which sheets, but there are several
  foreach my $name (sort keys %{ $opts{sheet2csvpath1} }) {
    unless ($opts{sheet2csvpath2}{$name}) {
      say "*** sheet '$name' exists ONLY in ",qsh($opts{origpath1}),"\n";
      $status = max($status, 2);
      next;
    }
    push @pairs, { n1 => $name, n2 => $name, sheetname_str => vis($name) };
  }
  foreach my $name (sort keys %{ $opts{sheet2csvpath2} }) {
    unless ($opts{sheet2csvpath1}{$name}) {
      say "*** sheet '$name' exists ONLY in ",qsh($opts{origpath2}),"\n";
      $status = max($status, 2);
      next;
    }
  }
}

foreach my $h (@pairs) {
  my ($name1, $name2, $label1, $label2, $sheetname_str)
              = @$h{qw/n1 n2 label1 label2 sheetname_str/};
  # If no labels provided, default to "/path/to/file.xls[sheetname]"
  $label1 //= form_spec_with_sheetname($opts{origpath1}, $name1);
  $label2 //= form_spec_with_sheetname($opts{origpath2}, $name2);

  $opts{path1} = $opts{sheet2csvpath1}{$name1} // die;
  $opts{path2} = $opts{sheet2csvpath2}{$name2} // die;
  $opts{label1} = $label1;
  $opts{label2} = $label2;
  $opts{sheetname_str} = $sheetname_str // "";
  $status = max($status, &process_pair);
}

exit $status;

sub process_pair {
  # N.B. %opts includes "origpathN" and "pathN" (the latter refers to a csv),
  # "sheetnameN", and "labelN".
  # "sheetname_str" is "" or "title...\n" to print before the first diff.
  # ---- Read data into memory, delete ignored columns, etc. ----
  my $obj = main::Differ->new( %opts );

  # Set data_source() to a unique human-readable representation of each
  # sheet.   This is the caller-specified $opts{nameN}, typically
  # "path/to/file.xls[sheetname]" or just "path/to/file",
  # omitting any common ancestor directory parts.
  my @splits;
  for my $N (1, 2) {
    push @splits, [ File::Spec->splitpath($obj->{"label$N"} // oops) ]; # [volume,dirs,fname]
  }
  if ($splits[0]->[0] eq $splits[1]->[0]) { # same volume
    $splits[0]->[0] = $splits[1]->[0] = "";
    my @dirs1 = File::Spec->splitdir($splits[0]->[1]);
    my @dirs2 = File::Spec->splitdir($splits[1]->[1]);
    while (@dirs1 && @dirs2 && $dirs1[0] eq $dirs2[0]) {
      shift @dirs1; shift @dirs2;
    }
    $splits[0]->[1] = catdir(@dirs1);
    $splits[1]->[1] = catdir(@dirs2);
  }
  for my $N (1, 2) {
    my $i = $N - 1;
    $obj->{"sheet$N"}->data_source(
           catdir($splits[$i]->[0], $splits[$i]->[1], $splits[$i]->[2]) );
  }

  # Preliminary raw comparison
  PRECHECK: {
    my $rows1 = $obj->{sheet1}->rows();
    my $rows2 = $obj->{sheet2}->rows();
    my $num_cols = $obj->{sheet1}->num_cols();
    last PRECHECK unless $obj->{sheet2}->num_cols() == $num_cols
                      && @$rows1 == @$rows2;
    for my $rx (0..$#$rows1) {
      my $r1 = $rows1->[$rx];
      my $r2 = $rows2->[$rx];
      last PRECHECK if first { $r1->[$_] ne $r2->[$_] } 0..($num_cols-1)
    }
    warn "> Paired sheets contain identical data, skipping fancier algos\n"
      if $opts{debug};
warn dvis '## $rows1\n   $rows2\n' if $opts{debug};
    return 0
  }

  if ($method ne "native") {
    foreach my $key (qw/id_columns always_show_columns hashid_func hash_func/) {
      if ($opts{$key}) {
        (my $optname = $key) =~ s/_/-/g;
        die "--${optname} option applies only to the 'native' method\n"
      }
    }
  }
  if ($method eq "diff") {
    $obj->compare_using_diff(['diff', '-u', @diff_opts]);
    return $obj->exit_status;
  }
  if ($method eq "tkdiff") {
    $obj->compare_using_diff([$method, @diff_opts]);
    return $obj->exit_status;
  }
  elsif ($method eq "git") {
    $obj->compare_using_diff([qw/git diff --no-index --color-words/, @diff_opts]);
    return $obj->exit_status;
  }
  elsif ($method eq "gitchars") {
    $obj->compare_using_diff([qw/git diff --no-index --word-diff=color --word-diff-regex=./, @diff_opts]);
    return $obj->exit_status;
  }
  elsif ($method eq "native") {
    $obj->compare_native();
    return $obj->exit_status;
  }
  else {
    die "Don't know comparison method '$method'\n";
  }
}


########################################
#
#
#
########################################
package main::Differ;

use Carp;
use File::Temp qw(tempfile tempdir);
use List::Util qw(first any min max any sum0);
use File::Basename qw(basename dirname fileparse);
use File::Path qw(make_path remove_tree);
use File::Spec::Functions qw(canonpath catfile catdir rootdir tmpdir);
use List::Util qw(min max any first);
use Spreadsheet::Edit qw(:DEFAULT logmsg cx2let let2cx);
use Data::Dumper::Interp;
sub oops(@) { unshift @_, "oops "; require Carp; goto &Carp::confess; }

sub _visualize($);
sub _title_or_origABC($$);

my $seen = {};
sub warnonce(@) {
  my $msg = join "",@_;
  return if $seen->{$msg}++;
  warn $msg;
}

sub cx2origcx($$) {
  my ($sheet, $currcx) = @_;
  #  0 1 2 3 [4] [5] 6 7 [8] 9 10  original
  #  0 1 2 3         4 5     6  7  after deletes
  my $deleted_cxs = $sheet->attributes->{DELETED_CXS} // confess("bug");
  my $nskipped = 0;
  my $oldcx = $currcx;
  for my $dcx (@$deleted_cxs) {
    return($currcx + $nskipped) if $dcx >= $oldcx;
    $oldcx = $dcx;
    ++$nskipped;
  }
  return($currcx + $nskipped);
}
sub cx2origlet($$) { cx2let(&cx2origcx) }

sub compile_if_regex(@) { # compile "/.../msix" strings to qr/.../msix
  my @specs = @_;
  foreach (@specs) {
    if (m#^(/.*/[a-z]*)\z|^m([/\[\{\(\<].*)\z#s) {
      my $regex = eval "qr${1}" // do{
        $@ =~ s/ at \(eval.*//mg;
        die "$@ in $_\n";
      };
      $_ = $regex
    }
  }
  wantarray ? @specs :
  @specs > 1 ? confess("multiple results") :
  $specs[0]
}

sub __truncate($$) {
  my ($aref, $maxwid) = @_;
  my $changed;
  foreach (@$aref) {
    if (length($_) > $maxwid) {
      $changed++;
      # cut before the first actual newline or after visualized newline
      s/\A.+?(?:\\n|(?=\n))\K.*/.../s;
      if (length($_) > $maxwid) {
        my $numdots = max(3, length($_)-$maxwid);
        substr($_, $maxwid - $numdots) = ("." x $numdots);
      }
      #warn dvis '##TT __truncated $_ (len=',length($_),")\n";
    }
  }
  $changed
}

sub __visualize($) {
  local $_ = shift;
  s/\n/\\n/sg;
  s/\t/\\t/g;
  s/([^[:print:]])/ sprintf "\\x{%02x}", ord($1) /eg;
  if ($opts{ign_leading_spaces}) {
    s/^( +)//;
  } else {
    # make leading spaces visible
    s/^( +)/$visible_space x length($1)/e;
  }
  if ($opts{ign_trailing_spaces}) {
    s/( +)$//;
  } else {
    # make trailing spaces visible
    s/( +)$/$visible_space x length($1)/e;
  }
  $_
}

# Read the two files (only one sheet each) into memory and delete
# columns to be ignored.  In each Spreadsheet::Edit sheet :
#
# To find corresponding columns after deleting ignored columns, even if
# the user specified absolute column letters:
#
#  For each in {always_show_columns} and {id_columns} :
#    alias <uniqueident> to the column and set
#      $sheet->attributes->{TRACKING}{ spec => [list of <uniqueident>] }
#      (each 'spec' might be a regex matching multiple values).
#
#  Set attributes->{DELETED_CXS} => [ original cxs ]
#
sub new {
  my $class = shift;
  my %hash = @_;
  warn __PACKAGE__,"->new",hvis(%hash),"\n" if $hash{debug};

  $hash{always_show_columns} //= $hash{id_columns};

  my @columns = @{ $hash{columns} // [] }; # copy; will be mutated
  my $always_show_columns = $hash{always_show_columns} // [];
  my $id_columns = $hash{id_columns} // [];
  my $negated;
  if (@columns) {
    $negated = $columns[0] =~ s/^-//;
    foreach (@columns[1..$#columns]) {
      die "You can not mix negated and non-negated --columns COLSPECS (",
          vis("$columns[0]")," and ",vis($_),")\n"
        unless s/^-// == $negated;
    }
  }
  ##my @sheet_new_opts = ($hash{debug} ? (verbose => 1) : (silent => 1)); # no alias warns
  my @sheet_new_opts = (silent => 1); # no alias warns
  for my $N (1, 2) {
    # Read into memory
    my $path = $hash{"path$N"} // confess "'path$N' option is required";
    my $sh = Spreadsheet::Edit->new(@sheet_new_opts);

    my $title_row_spec = $hash{title_row}; # TODO: Allow separate specification

    $sh->read_spreadsheet({
                           use_gnumeric => 1,
                           quote_char   => $hash{quote_char},
                           sep_char     => $hash{sep_char},
                           encoding     => $hash{encoding},
                           silent  => $hash{silent},
                           verbose => $hash{debug},
                           debug   => $hash{debug},
                           (defined($title_row_spec)
                             ? $title_row_spec == 0
                               ? (title_rx => undef)
                               : (title_rx => ($title_row_spec-1),
                                  required => [compile_if_regex
                                   @$always_show_columns,@$id_columns,@columns],
                                 )
                             : () # allow auto-detect if not specified
                           ),
                          }, $path
                         );

    $hash{"sheet$N"} = $sh;
    $sh->attributes->{ORIG_PATH} = $path;
  }

  my $ncols = 0;
  for my $N (1,2) {
    my $sh = $hash{"sheet$N"};

    $ncols = max($ncols, $sh->num_cols);

    # Create aliases for columns which might move when non-compared
    # columns are deleted later.  The aliases will automatically track,
    # relevant when the user specified absolute column letters.
    # Each <spec> might be a regex which matches multiple columns!
    my %cx2seq;
    foreach my $spec (@$always_show_columns, @$id_columns,
                      @columns #unnecessary???
                     ) {
      my @cxlist = $sh->spectocx(compile_if_regex($spec));

      if (defined(my $prev_aliases = $sh->attributes->{TRACKING}->{ $spec })) {
        # This same spec was seen before (perhaps in both always-show & id),
        # and we already processed the cx(s) the spec specifies, i.e. @cxlist.

        my %seen_cx = map{($_ => 1)} @cxlist; # Sanity check...
        for (@$prev_aliases) {
          oops unless defined delete $seen_cx{ $sh->colx->{$_} };
        }
        oops if keys %seen_cx;
      } else {
        my @aliaslist =
                  map { "__origcx${_}_".$cx2seq{$_}++."_".title2ident($spec) }
                      @cxlist;
        $sh->attributes->{TRACKING}->{ $spec } = \@aliaslist;
        for (0..$#aliaslist) {
          $sh->alias($aliaslist[$_] => $cxlist[$_]);
          oops unless $sh->colx->{$aliaslist[$_]} == $cxlist[$_];
        }
      }
    }

    # Pre-sort the files
    if (@$id_columns && $hash{sort_rows}) {

      my $title_rx = $sh->title_rx;
      $sh->first_data_rx(defined($hash{first_data_row})
                          ? ($hash{first_data_row}-1)
                          : (($title_rx // -1) + 1));

      # FIXME? Can the "Translate --always-show and --id-columns to cx..."
      # be hoisted up here from compare_native()?  Maybe not because
      # that has to be done after deleting ignored columns (otoh maybe we can
      # sort after deleting?)  For now, repeating code from there...
      my @key_cxs = map{
                      map{ $sh->colx->{$_} // oops }
                      @{ $sh->attributes->{TRACKING}->{ $_ } // oops }
                    }
                    @{$hash{id_columns}} ;

      $sh->attributes->{ORIGINAL_RXS} = [
        $sh->sort_rows(sub{
          foreach my $cx (@key_cxs) {
            my $r = $a->[$cx] cmp $b->[$cx];
            return $r if $r;
          }
          0
        })
      ];
    }

    # Remove columns not being compared
    if (@columns) {
      my @cxlist = $sh->spectocx(compile_if_regex(@columns));
      my %delete_cxs;
      if ($negated) {
        %delete_cxs = map{ $_ => 1 } @cxlist;
      } else {
        %delete_cxs = map{ $_ => 1 } 0..$sh->num_cols-1;
        delete @delete_cxs{ @cxlist };
      }
      my @deleted_cxs = sort {$a <=> $b} keys %delete_cxs;
      warn "negated=",vis($negated),
           " delete cxs:",avis(@deleted_cxs),
           " abcs:",avis(map{cx2let} @deleted_cxs),"\n"
         if $hash{debug};
      $sh->delete_cols(@deleted_cxs);
      $sh->attributes->{DELETED_CXS} = \@deleted_cxs;
    } else {
      $sh->attributes->{DELETED_CXS} = [];
    }

    # Change newline etc. and non-printing to visible escapes
    $sh->apply_all(sub{
      foreach (@{ $sh->crow }) {
        $_ = __visualize($_);
      }
    });

    # Truncate overly-long titles
    if (defined(my $title_row = $sh->title_row)) {
      if (__truncate($title_row, $hash{trunc_title_width})) {
        # Is this necessary (or wise) ???
        $sh->title_rx($sh->title_rx); # re-compute %colx
      }
    }
  }

  # Append empty columns if needed to make both sheets the same width
  for my $N (1,2) {
    my $sh = $hash{"sheet$N"};
    while ($sh->num_cols < $ncols) { $sh->insert_col('>$', "") }
  }
  bless \%hash, $class;
}

sub _getncols($$) {
  my ($sh, $rx) = @_;
  $rx >= ($sh->title_rx//0) ? $sh->attributes->{NUMCOLS_USED} : $sh->num_cols
}

sub _title_or_origABC($$) {
  my ($sh, $cx) = @_;
  my $titlerow = $sh->title_row;
  (defined($titlerow) && $titlerow->[$cx] ne "")
    ? $titlerow->[$cx]
    : cx2origlet($sh,$cx)
}

sub _native_output {
  my ($self, $cxlist1, $cxlist2, $diff, $dumbrun) = @_; oops unless @_==5;
  my $restricted_keycols = @{$self->{id_columns}} > 0;
  my $changed;
  $diff->Reset();
  while( $diff->Next() ) {
    my @items1 = $diff->Items(1);
    my @items2 = $diff->Items(2);
    if ($diff->Same() && $restricted_keycols) {
      # Diff looked only at @id_columns and found no changes; there
      # may be other changes, which we manually detect here.
      die "bug" unless @items1 == @items2;
      for my $i (0 .. $#items1) {
        # Each item is [sheet#, rowref, original_rx]
        my ($N1, $row1, $rx1) = @{$items1[$i]};
        my ($N2, $row2, $rx2) = @{$items2[$i]};
        my $hash1 = $hash_func_code->([@{$row1}[ @$cxlist1 ]], $rx1)
             // die "ERROR: hash-func returned undef for row ".($rx1+1)." in ".$self->{"sheet$N1"}->data_source()."\n";
        my $hash2 = $hash_func_code->([@{$row2}[ @$cxlist2 ]], $rx2)
             // die "ERROR: hash-func returned undef for row ".($rx2+1)." in ".$self->{"sheet$N2"}->data_source()."\n";
        if ($hash1 ne $hash2) {
          $self->diff_rows($rx1, $rx2, "CHANGED", $dumbrun);
          $changed = 1;
        } else {
          #print "NOT-CHANGED: ", verbrow_str($rx1,$rx2), "\n"
          #  if $debug && !$dumbrun;
        }
      }
    }
    next
      if $diff->Same();
    $changed = 1;

    if (@items2 == 0) {
      # A hunk of rows deleted from self->{sheet2}
      for my $i (0 .. $#items1) {
        my ($N, $row, $orig_rx) = @{ $items1[$i] };
        $self->show_row($N, $row, $orig_rx, "DELETED", $dumbrun);
      }
    }
    elsif (@items1 == 0) {
      # A hunk of rows added to self->{sheet2}
      for my $i (0 .. $#items2) {
        my ($N, $row, $orig_rx) = @{ $items2[$i] };
        $self->show_row($N, $row, $orig_rx, "ADDED", $dumbrun);
      }
    }
    else {
      # A range replaced
      my $lim = min($#items1, $#items2);
      for my $i (0 .. $lim) {
        if ($restricted_keycols) {
          # Did not match in ID columns, so they must be unrelated
          # records; show as deletes and adds.
          my ($N1, $row1, $rx1) = @{$items1[$i]};
          my ($N2, $row2, $rx2) = @{$items2[$i]};
          #??? When is it not the case that $N1==1 and $N2==2 ???
          $self->show_row($N1, $row1, $rx1, "*Deleted", $dumbrun);
          $self->show_row($N2, $row2, $rx2, "*Added", $dumbrun);
        } else {
          my (undef, undef, $rx1) = @{$items1[$i]};
          my (undef, undef, $rx2) = @{$items2[$i]};
          $self->diff_rows($rx1, $rx2, "Changed", $dumbrun)
        }
      }
      for my $i ($lim+1 .. $#items1) {
        my ($N1, $row1, $rx1) = @{$items1[$i]};
        oops unless $N1==1;
        $self->show_row(1, $row1, $rx1, "Deleted*", $dumbrun);
      }
      for my $i ($lim+1 .. $#items2) {
        my ($N2, $row2, $rx2) = @{$items2[$i]};
        oops unless $N2==2;
        $self->show_row(2, $row2, $rx2, "Added*", $dumbrun);
      }
    }
  }
  return $changed
}#_native_output

sub _check_title_rows {
  my $self = shift;
  my ($sh1, $sh2) = @$self{qw/sheet1 sheet2/};

  if (!defined($self->{title_row})) {
    my sub _titlerow_msg {
      my $sh = shift;
      my $rx = $sh->title_rx;
      defined($rx) ? "--title-row ".($rx+1) : "NO --title-row"
    }
    if ($self->{verbose}) {
      if (u($sh1->title_rx) eq u($sh2->title_rx)) {
        warn "> Auto-detected ", _titlerow_msg($sh1), " in both\n";
      } else {
        warn "> Auto-detected DIFFERENT title rows:\n";
        warn "    ",_titlerow_msg($sh1), " in ", $sh1->data_source, "\n";
        warn "    ",_titlerow_msg($sh2), " in ", $sh2->data_source, "\n";
      }
    }
  } else {
    confess "bug"
      ,vis($sh1->attributes())
      if $self->{title_row}==0 && defined( $sh1->title_rx );
  }
}#_check_title_rows

# Find corresponding columns with the same title or fuzzy-matched titles
sub _pairup_columns {
  my $self = shift;
  my ($sh1, $sh2) = @$self{qw/sheet1 sheet2/};
  # Match titles; if not all matched, try fuzzy alternatives, exhausting all
  # possibilities at each "level of fuzzyness" before trying the next level.
  #
  # Matching by levels avoids wrongly matching a fuzzed title when the same
  # string is a non-fuzzed (or less-fuzzed) title in the other file.
  # For example, with
  #    f1=(Foo, Bar)  f2=(Foo, foo, bar)
  # "Foo" must be matched before trying fuzzed versions; otherwise
  # f1-cx0 might be fuzzed to "foo" and wrongly match f2-cx1 when it
  # should match f2-cx0.
  #
  # Note: Fuzzing can make mistakes.  For example with
  #   f1=(F_oo,Bar) and f2=(Foo, f_oo), no titles match without fuzzing
  # but it is unclear whether f1's "F_oo" should match "Foo" or "f_oo" in f2.
  # Nevertheless, we will pick one and if wrong then the user will get a
  # "change" in every row.

  my sub add_titles($$$) {
    my ($hash, $level, $sh) = @_;
    # Add titles (fuzzed to exactly the specified degree) to a hash unless
    # already present, i.e. never trying a fuzzed title which matches
    # a non-fuzzed (or less-fuzzed) title in the same file.
    my $title_row = $sh->title_row
         // return;
    foreach my $cx (0..$sh->num_cols-1) {
      my $title = $title_row->[$cx];
      my $t = $title;
      {                last if $level==0;
        $t = fc($t);   last if $level==1;
        $t =~ s/[_. ]|\Q${visible_space}\E/-/g;
                       last if $level==2;
        ($t = fc $title) =~ s/\s+|\Q${visible_space}\E//sg;
                       last if $level==3;
        $t =~ s/\W//g; last if $level==4;
        oops
      }
      unless(exists $hash->{$t}) {
        $hash->{$t} = $cx;
        #warn dvis '## $level $title -> $t\n';
      }
    }
  }

  my (@corresp_titles1, @corresp_titles2);
  my (@corresp_cxlist1, @corresp_cxlist2);
  my ($title_row1, $title_row2) = ($sh1->title_row, $sh2->title_row);
  my ($hash1, $hash2) = ({}, {});
  my (%cx1found, %cx2found);
  for my $level (0..4) {
    add_titles($hash1, $level, $sh1);
    add_titles($hash2, $level, $sh2);
    while (my ($ft,$cx1) = each %$hash1) {
      next if $cx1found{$cx1};
      if (defined(my $cx2 = delete $hash2->{$ft})) { # "matching" titles
        oops(dvis '$cx1 $cx2 $ft\n$hash1\n$hash2\%cx1found\n%cx2found')
          if $cx2found{$cx2};
        #$cx1found{$cx1} = $cx2found{$cx2} = 1;
        $cx1found{$cx1} = $cx2found{$cx2} = "1,2:$cx1-$cx2";
        delete $hash1->{$ft};
        push @corresp_cxlist1, $cx1;
        push @corresp_cxlist2, $cx2;
        push @corresp_titles1, $title_row1->[$cx1] // oops;
        push @corresp_titles2, $title_row2->[$cx2] // oops;
      }
    }
    while (my ($ft,$cx2) = each %$hash2) {
      next if $cx2found{$cx2};
      if (defined(my $cx1 = delete $hash1->{$ft})) {
        oops(dvis '$level $cx2 $cx1 $ft\n$hash2\n$hash1\n%cx2found\n%cx1found\n$title_row1->[$cx1] $title_row2->[$cx2]')
          if $cx1found{$cx1};
        #$cx2found{$cx2} = $cx1found{$cx1} = 2;
        $cx1found{$cx1} = $cx2found{$cx2} = "2,1:$cx2-$cx1";
        delete $hash2->{$ft};
        push @corresp_cxlist1, $cx1;
        push @corresp_cxlist2, $cx2;
        push @corresp_titles1, $title_row1->[$cx1] // oops;
        push @corresp_titles2, $title_row2->[$cx2] // oops;
      }
    }
    last unless keys %$hash1; # stop if original titles all matched
  }
  oops unless @corresp_cxlist1 == @corresp_cxlist2;
  oops unless @corresp_cxlist1 == @corresp_titles1;
  oops unless @corresp_cxlist1 == @corresp_titles2;
  { my @order = sort {$corresp_cxlist1[$a] <=> $corresp_cxlist1[$b]}
                     0..$#corresp_cxlist1;
    @corresp_cxlist1 = @corresp_cxlist1[@order];
    @corresp_cxlist2 = @corresp_cxlist2[@order];
    @corresp_titles1 = @corresp_titles1[@order];
    @corresp_titles2 = @corresp_titles2[@order];
  }
  # Assume un-matched titles *in the same column* correspond
  # (if there are no titles, then all columns correspond!)
  # A collective title is synthesized for each pair.
  # Otherwise unmatched titles are considered unpaired.
  my (@unpaired_cxlist1, @unpaired_cxlist2);
  for my $cx (0..($sh1->num_cols-1)) {
    next if defined $cx1found{$cx};
    if (!defined $cx2found{$cx} && $cx < $sh2->num_cols) {
      $cx1found{$cx} = $cx2found{$cx} = 1;
      push @corresp_cxlist1, $cx;
      push @corresp_cxlist2, $cx;
      my $synthetic_title = "(col ".cx2let($cx).")";
      push @corresp_titles1, $synthetic_title;
      push @corresp_titles2, $synthetic_title;
    } else {
      push @unpaired_cxlist1, $cx;
    }
  }
  for my $cx (0..($sh2->num_cols-2)) {
    next if defined $cx2found{$cx};
    push @unpaired_cxlist2, $cx;
  }
  oops unless @corresp_cxlist1 + @unpaired_cxlist1 == $sh1->num_cols;
  oops unless @corresp_cxlist1 + @unpaired_cxlist2 == $sh2->num_cols;

  $sh1->attributes->{CORRESP_TITLES  } = \@corresp_titles1;
  $sh1->attributes->{CORRESP_CXLIST  } = \@corresp_cxlist1;
  $sh1->attributes->{UNPAIRED_CXLIST } = \@unpaired_cxlist1;

  $sh2->attributes->{CORRESP_TITLES  } = \@corresp_titles2;
  $sh2->attributes->{CORRESP_CXLIST  } = \@corresp_cxlist2;
  $sh2->attributes->{UNPAIRED_CXLIST } = \@unpaired_cxlist2;

  warn dvis '$title_row1\n$title_row2\n'
           .'@corresp_titles1\n@corresp_titles2\n'
           .'@corresp_cxlist1\n@corresp_cxlist2\n'
           .'@unpaired_cxlist1\n@unpaired_cxlist2\n'
    if $opts{debug};

}#_pairup_columns

sub compare_native {
  my $self = shift;
  require Algorithm::Diff;

  for my $N (1, 2) {
    my $sh = $self->{"sheet$N"};
    # Determine widest row in rows not preceeding title row (if any),
    # excluding empty cells on the right end of a row.
    my $ndcols = 0;
    my $title_rx = $sh->title_rx;
    $sh->apply_all(sub{
      return if defined($title_rx) && $sh->rx < $title_rx;
      my $n = $sh->num_cols;
      while ($n > 0 && $sh->crow->[$n-1] eq "") { --$n }
      $ndcols = max($ndcols, $n);
    });
    $sh->attributes->{NUMCOLS_USED} = $ndcols;

    # Translate --always-show and --id columns to cx values for each sheet
    foreach(['always_show_columns','ALWAYS_SHOW'],
            ['id_columns','ID_CXS']) {
      my ($option_key, $attr_key) = @$_;
      foreach my $spec (@{$self->{$option_key}}) {
        my $identlist = $sh->attributes->{TRACKING}->{ $spec } // confess;
        foreach (@$identlist) {
          my $cx = $sh->colx->{$_}
            // die "Can not use ignored column '$_' ($spec) for $option_key",
                   " (",$sh->data_source,")\n";

          $sh->attributes->{$attr_key}->{$cx} = 1; # e.g. attributes->{ID_CXS}
        }
      }
    }
  }

  $self->_check_title_rows();

  # Set attributes CORRESP_TITLES, CORRESP_CXLIST, and UNPAIRED_CXLIST
  $self->_pairup_columns();

  my $restricted_keycols = @{$self->{id_columns}} > 0;

  my @cxlist1 = ( @{ $self->{sheet1}->attributes->{CORRESP_CXLIST} // [] },
                  @{ $self->{sheet1}->attributes->{UNPAIRED_CXLIST}     }
                );
  my @cxlist2 = ( @{ $self->{sheet2}->attributes->{CORRESP_CXLIST} // [] },
                  @{ $self->{sheet2}->attributes->{UNPAIRED_CXLIST}     }
                );

  # FIXME: What is difference between unmatched-in-same-columm
  #  pairs in CORRESP_CXLIST and UNPAIRED_CXLIST ??
  #  If *all* unmatched were just put into UNPAIRED_CXLIST in cx order,
  #  I think they would line up exactly the same way as we get here...

  # Corresponding columns, limited to --id-columns if specified, otherwise all
  my (@idcxlist1, @idcxlist2);
  if ($restricted_keycols) {
    @idcxlist1 = sort {$a <=> $b}
                 keys %{ $self->{sheet1}->attributes->{ID_CXS} };
    @idcxlist2 = sort {$a <=> $b}
                 keys %{ $self->{sheet2}->attributes->{ID_CXS} };
  } else {
    @idcxlist1 = @cxlist1;
    @idcxlist2 = @cxlist2;
  }
  my $max_idcxlist_max = max($#idcxlist1, $#idcxlist2);
  my @idpadding1 = ("") x ($max_idcxlist_max - $#idcxlist1);
  my @idpadding2 = ("") x ($max_idcxlist_max - $#idcxlist2);

  # Call user-specified global setup code (defaults to nop).
  # This could edit the Spreadsheet::Edit sheet contents and/or
  # save information in globals (declared with "our") for later use
  # by hashid-func or hash-func.  All these user-defined functions
  # are compiled into package Usercode.
  # FIXME: Probably too late for this to be useful. UNDOCUMENTING for now...
  $setup_code_code->( $self->{"sheet1"},
                      $self->{"sheet2"},
                      \@idcxlist1,
                      \@idcxlist2,
                    );

  my $keygen = sub{
    # Form a hash of the contents of the "id columns" in a given row
    # (or all columns in the absence of --id-column arguments).
    #
    # The Diff algorithm is run using only these hash values, and so rows
    # which represent the same record (i.e. same 'id' values) will
    # be reported as "unchanged" even though they may differ in other columns,
    # and other kinds of hunks represent additions or deletions of identities
    # to the data.
    #
    # OTOH, if there ARE NO ID columns, then the hash covers all columns
    # and only completely-identical rows are reported as "unchanged"
    # and changes may be reported as add, delete, or replace in a somewhat
    # unpredictable way (which is why using --id-columns is helpful).
    #
    # This processing occurs in _native_output()
    #
    # If one file has fewer columns than the other, the narrower rows are
    # padded with empty strings.
    my ($N, $row, $orig_rx) = @{ $_[0] };
    my $sh = $self->{"sheet$N"};
    if ($orig_rx eq u($sh->title_rx)) {
      # Return a special token to force title rows to "match".  This makes
      # changes in titles appear as a "change" and not "delete" + "add".
      return " __TITLE_ROW__ ";
    }
    my $idcxlist  = ($N==1 ? \@idcxlist1  : \@idcxlist2);
    my $idpadding = ($N==1 ? \@idpadding1 : \@idpadding2);

    my $s = $hashid_func_code->([@{$row}[ @$idcxlist ], @$idpadding], $orig_rx);
    $s;
  };

  # Pre-screen all records, ignoring any for which hashid-func returns
  # undef, but remembering the original rx values for display to the user.
  #
  # Note: An alternate implementation would be to pass the hash strings
  # directly to Algorithm::Diff with no keyGen for faster Diff operation
  # (Diff is optimized for this case), remembering original rx values in
  # a side hash.  However this would double memory usage and performance
  # probably matters only with very large spreadsheets when conserving memory
  # is arguably more important than conserving CPU.

  # Two arrays of [sheet#, rowref, original_rx], one for each sheet:
  my @diffargs = ([], []);
  foreach my $N (1,2) {
    my $sh = $self->{"sheet$N"};
    my $rows = $sh->rows;
    for (my $rx=0; $rx <= $#$rows; $rx++) {
      my $row = $rows->[$rx];
      my $diffitem = [$N, $row, $rx];   # [sheet#, rowref, original_rx]
      push @{ $diffargs[$N-1] }, $diffitem if defined($keygen->($diffitem));
    }
  }

  warn dvis '$diffargs[0]\n$diffargs[1]\n' if $self->{debug};

  my $diff = Algorithm::Diff->new(
    $diffargs[0],
    $diffargs[1],
    { keyGen => $keygen }
  );

  # dumbrun==true to determine the width of the widest displayed title
  # without actually printing anything
  $self->{widest_title_width} = 0;
  my $changed = $self->_native_output(\@cxlist1, \@cxlist2, $diff, 1);

  if ($changed) {
    if ($self->{sheetname_str} ne "") {
      my $wid = length($self->{sheetname_str});
      my $sep = "*" x (10 + $wid + 4);
      print "\n$sep\n";
      print "*** SHEET ",$self->{sheetname_str}, " ***\n";
      print "$sep\n";
    }
    $diff->Reset;
    my $changed2 = $self->_native_output(\@cxlist1, \@cxlist2, $diff, 0);
    oops if $changed2 != $changed;
  }
  else {
    if ($opts{debug}) {
      ###TEMP FIXME remove this after verified
      $diff->Reset;
      my $changed2 = $self->_native_output(\@cxlist1, \@cxlist2, $diff, 0);
      oops if $changed2;
    }
  }
  $self->{exit_status} = ($changed ? 1 : 0);
}

# Format a value for display as an indented block.
# Newlines in the input are already converted to visible "\n".
# Actual newlines are appended to these markers and indentation
# inserted before second and subsequent lines.  Quotes are not
# included in the result.
# Usage:
#   printf "%*s: '%s'\n", $twid, $title, fmt_value($valstr,$twid+3);
sub fmt_value($$) {
  my ($str, $indent_width) = @_;
  oops if $str =~ /\n/s;
  my $indent = " " x $indent_width;
  $str =~ s/\\n/\\n\n${indent}/gs;
  if ($maxwidth) { # fold
    my $first_mw = $maxwidth - $indent_width;
    if (length($str) > $first_mw) {
      oops "maxwidth $maxwidth is too narrow\nfmw=$first_mw iw=$indent_width <<$str>>"
        if $first_mw < 20;  # sanity
      $str =~ s/\A([^\n]{$first_mw})([^\n]+)/$1\n${indent}$2/m;
      while ($str =~ s/^([^\n]{$maxwidth})([^\n]+)/$1\n${indent}$2/m) { }
    }
  }
  $str
}
#BEGIN {
#$maxwidth = 30; for my $w (0..$maxwidth*3) {
#  my ($i, $NextL) = (0, "A");
#  my $s = "";
#  for (0..$w-1) {
#    my $d = ($i++ % 10);
#    if ($d==0 && $i > 1) { $d = $NextL++ }
#    $s .= $d;
#    if (($_ % 62)==9) { $s .= "\\n" }
#  }
#  print "Input : $s\n";
#  print "Output: ", fmt_value($s,8), "\n";
#}
#die "Test exit";
#}

sub show_cell {
  my ($self, $title, $v1, $v2, $dumbrun) = @_; oops unless @_ == 5;

  my $widest_title_width = $self->{widest_title_width};
  if ($dumbrun) {
    # $title has been visualized with newline replaced by literal '\n',
    # and they will be displayed just that way, without any wrapping.
    my $tw = length($title);
    if ($tw > $widest_title_width) {
      $self->{widest_title_width} = $tw;
    }
    return
  }

  my $twidth = (2+$widest_title_width);  # 2 for indent

  my $v1str = fmt_value($v1, $twidth+3); # +3 for the ": '"

  my $v2str = fmt_value($v2, $twidth+3)
                if defined($v2) && $v1 ne $v2;

  if (! defined $v2) {
    printf "%*s: '%s'\n", $twidth, $title, $v1str;
  }
  elsif ($v1 eq $v2) {
    printf "%*s: '%s' (unchanged)\n", $twidth, $title, $v1str;
  }
  else {
    my $v1_nl = 1 + @{[ $v1str =~ /\n/sg ]};
    my $v2_nl = 1 + @{[ $v2str =~ /\n/sg ]};
    if ($v2_nl <= 1) {
      my $s = sprintf "%*s: '%s'", $twidth, $title, $v1str;
      if ($v2_nl==1
          && (!$maxwidth
              || length($s)+4+length($RArrow)+length($v2str) <= $maxwidth)
         ) {
        # COL TITLE: 'old text' -> 'new text' (on one line)
        $s .= " $RArrow '$v2str'";
#$s .= "\n<<<". dvis '$v1 $v2 $v1str $v2str>>>';
      }
      else {
        #  COL TITLE: 'old text'
        #          -> 'new text'
        $s .= sprintf "\n%*s $RArrow '%s'",
                      $twidth-length($RArrow), "",
                      $v2str;
      }
      print $s, "\n";
    } else {
      # Long multi-line blocks -- show ala 'diff -u infinity' :
      #  COL TITLE:  unchanged line
      #            + added line
      #            - deleted line
      #              unchanged line
      my @lines1 = split /\\n/, $v1;
      my @lines2 = split /\\n/, $v2;
      my $diff = Algorithm::Diff->new(\@lines1, \@lines2);
      my $title_str  = sprintf "%*s:", $twidth, $title;
      my $indent_str = sprintf "%*s ", $twidth, "";
      my $pfx = $title_str;
      while(my $posn = $diff->Next()) {
        my $d = $diff->Diff;
        next if $d==0 && $opts{suppress_common_lines};
        foreach ($diff->Items(1)) {
          print $pfx, (($d&1) ? "- " : "  "), fmt_value($_, $twidth+3), "\n";
          $pfx = $indent_str;
        }
        next if $d==0; # both lists are the same
        foreach ($diff->Items(2)) {
          print $pfx, (($d&2) ? "+ " : "  "), fmt_value($_, $twidth+3), "\n";
          $pfx = $indent_str;
        }
      }
    }
  }
} #show_cell
sub row_header($$;$) {
  my ($verb, $rx1, $rx2) = @_;
  my $s = "$verb row ".($rx1+1);
  $s .= " (row ".($rx2+1)." in 2nd file)" if defined($rx2) && $rx1 != $rx2;
  sprintf "\n-------- %s %s\n", $s, ('-' x ($maxwidth-10-length($s)));
}
sub show_row {
  #my ($self, $N, $rx, $verb, $dumbrun) = @_; oops if @_ != 5;
  my ($self, $N, $row, $curr_rx, $verb, $dumbrun) = @_; oops if @_ != 6;
  my $sh = $self->{"sheet$N"};
  my $orig_rx = $sh->attributes->{ORIGINAL_RXS}->[$curr_rx] // $curr_rx;
  unless ($dumbrun) {
    print row_header($verb, $orig_rx);
  }
  my sub ABC_label {
    my $cx = shift;
    defined($sh->title_rx()) ? "(".cx2origlet($sh,$cx).")" : cx2let($cx);
  }
  my $title_rx = $sh->title_rx;
  my $corresp_titles  = $sh->attributes->{CORRESP_TITLES};
  my $always_show     = $sh->attributes->{ALWAYS_SHOW};
  my %notseen = map{ ($_=>1) } (0.._getncols($sh,$curr_rx)-1);
  my $all_empty = 1;

  for my $show_always_shown (1, 0) { # Show --id-columns etc. first
    if (defined $corresp_titles) {
      my $corresp_cxlist = $sh->attributes->{CORRESP_CXLIST};
      for my $i (0 .. $#$corresp_cxlist) {
        my $cx = $corresp_cxlist->[$i];
        next if ((!$always_show->{$cx}) ^ (!$show_always_shown));
        my $value = $row->[$cx];
        if ($value ne "" || $self->{show_empties} || $always_show->{ $cx }) {
          my $label = $curr_rx <= ($title_rx//-1)
               ? ABC_label($cx) : $corresp_titles->[$i];
          $self->show_cell($label, $value, undef, $dumbrun);
          $all_empty = 0;
        }
        delete $notseen{$cx};
      }
    }
    for my $cx (@{ $sh->attributes->{UNPAIRED_CXLIST} }) {
      next if (!!$always_show->{ $cx } ^ !!$show_always_shown);
      my $value = $row->[$cx];
      if ($value ne "" || $self->{show_empties} || $always_show->{ $cx }) {
        $self->show_cell(ABC_label($cx), $value, undef, $dumbrun);
        $all_empty = 0;
      }
      delete $notseen{$cx};
    }
  }
  die dvis 'bug\n  %notseen\n  $sh->attributes()\n ' if %notseen;
  print "    (empty row)\n" if $all_empty && !$dumbrun;
}
sub diff_rows {
  my ($self, $rx1, $rx2, $verb, $dumbrun) = @_;
  oops unless @_ == 5;
  my ($sheet1, $sheet2) = @$self{qw/sheet1 sheet2/};
  my $orig_rx1 = $sheet1->attributes->{ORIGINAL_RXS}->[$rx1] // $rx1;
  my $orig_rx2 = $sheet2->attributes->{ORIGINAL_RXS}->[$rx2] // $rx2;
  my $headerline = row_header($verb, $orig_rx1, $orig_rx2);
  my $header_printed = $dumbrun; #suppress during dumb run
  my $row1 = $sheet1->[$rx1] // die "BUG: Undef row in sheet1 at rx1=$rx1\n";
  my $row2 = $sheet2->[$rx2] // die "BUG: Undef row in sheet2 at rx2=$rx2\n";
  my $ncols1 = _getncols($sheet1, $rx1);
  my $ncols2 = _getncols($sheet2, $rx2);
  my %notseen1 = map{ ($_=>1) } (0..$ncols1-1);
  my %notseen2 = map{ ($_=>1) } (0..$ncols2-1);

  my $corresp_titles  = $sheet1->attributes->{CORRESP_TITLES};
  if (defined $corresp_titles) {
    my $corresp_cxlist1 = $sheet1->attributes->{CORRESP_CXLIST};
    my $corresp_cxlist2 = $sheet2->attributes->{CORRESP_CXLIST};
    for my $show_always_shown (1, 0) { # Show --id-columns etc. first
      for my $i (0 .. $#$corresp_cxlist1) {
        my $cx1 = $corresp_cxlist1->[$i];
        next if ((!$sheet1->attributes->{ALWAYS_SHOW}->{$cx1}) ^ (!$show_always_shown));
        my $cx2 = $corresp_cxlist2->[$i];
        delete $notseen1{$cx1};
        delete $notseen2{$cx2};
        if (($row1->[$cx1] ne $row2->[$cx2])
              || $sheet1->attributes->{ALWAYS_SHOW}->{ $cx1 }) {
          print $headerline unless $header_printed++;
          $self->show_cell($corresp_titles->[$i], $row1->[$cx1], $row2->[$cx2], $dumbrun);
        }
      }
    }
  }
  # "Singleton" columns have no corresponding column in the other sheet.
  # Show their cells only if not empty because they don't actually represent
  # different content except when they contain something.
  my $title1_row = $sheet1->title_row;
  for my $cx1 (@{ $sheet1->attributes->{UNPAIRED_CXLIST} }) {
    if ((my $content = $row1->[$cx1]) ne "") {
      print $headerline unless $header_printed++;
      $self->show_cell("f1."._title_or_origABC($sheet1,$cx1), $content, undef, $dumbrun);
    }
    delete $notseen1{$cx1};
  }
  my $title2_row = $sheet2->title_row;
  for my $cx2 (@{ $sheet2->attributes->{UNPAIRED_CXLIST} }) {
    if ((my $content = $row2->[$cx2]) ne "") {
      print $headerline unless $header_printed++;
      $self->show_cell("f2."._title_or_origABC($sheet2,$cx2), $content, undef, $dumbrun);
    }
    delete $notseen2{$cx2};
  }
  die dvis 'bug $rx1 $rx2 $verb %notseen1 %notseen2\n$sheet1->attributes()\n$sheet2->attributes()'
    if keys(%notseen1) or keys(%notseen2);
}

sub compare_using_diff {
  my ($self, $cmd) = @_;
  confess "usage: obj->compare_using_diff([command, args...])"
    unless ref($cmd) eq "ARRAY" && $cmd->[0] =~ /diff|git/i;

#  # With standard diff, we have no way to suppress the header if there are
#  # no diffs, so always print it (it will be "" if no header is ever
#  # appropraite).
#  confess "bug" unless defined $self->{sheetname_str};
#  print $self->{sheetname_str};

  # Write the possibly-edited-for-visibiliity data to temp .csv files,
  # and create symlinks to these csv files for using human-friendly names
  # "labelN" (optionally in subdirs if the labels include "directories").
  # These are used when running [tk]diff for nice output.
  # N.B. this is necessary because tkdiff does not support the --label option!
  # 4/1/23: Ditto for git diff.
  my $tdirdepth = 0;
  foreach my $N (1,2) {
    my $sh = $self->{"sheet$N"};
    ($self->{"linkrpath$N"} = $sh->data_source()) =~ s/^\Q${ \rootdir() }\E//;
    my @dotdots = $self->{"linkrpath$N"} =~ /(?:^|\/)\.\.(?=$|\/)/g;
    $tdirdepth = max($tdirdepth, $#dotdots+1);
    #FIXME: Is the +1 needed?   Verify with actual cases!
  }
  my $tdir = $self->{tempdir} // tempdir(CLEANUP => 1);
  # If linkrpaths contain .. put them in a sufficiently-deep subdir
  # that they will still be within our temp directory structure
  $tdir = catdir($tdir, ("sub" x $tdirdepth));
  make_path($tdir);

  foreach my $N (1,2) {
    $self->{"tmpcsv$N"} = catfile($tdir, "TEMPCSV$N$.csv");
    $self->{"sheet$N"}->write_csv( $self->{"tmpcsv$N"} );
  }

  foreach my $N (1,2) {
    my $sh = $self->{"sheet$N"};
    (my $rpath = $sh->data_source()) =~ s/^\Q${ \rootdir() }\E//;
    $self->{"linkrpath$N"} = $rpath;
    $self->{"link$N"} = canonpath(
            catfile(catdir($tdir,dirname($rpath)), basename($rpath)) );
    warn "### link$N = ",qsh($self->{"link$N"}),
         " rpath=",qsh($rpath),
         " linkpath$N = ",qsh($self->{"linkrpath$N"}), "\n" if $self->{debug};
    make_path( dirname($self->{"link$N"}) );
    # Arrgh! git diff never follows symlinks
    # Also some platforms so not implement symlink
    if ($cmd->[0] =~ /git/
        || ! eval{ symlink $self->{"tmpcsv$N"}, $self->{"link$N"} }) {
      File::Copy::copy($self->{"tmpcsv$N"}, $self->{"link$N"})
        or die "copy ",$self->{"tmpcsv$N"}," ",$self->{"link$N"}," : $!";
    }
  }

  my @cmd = (@$cmd, $self->{"linkrpath1"}, $self->{"linkrpath2"});
  my $pid = fork;
  if ($pid == 0) { #CHILD
    warn "> cd ", qsh($tdir), "\n" if $self->{verbose};
    chdir $tdir or die "chdir $tdir : $!";
    warn "> ",join(" ",map{qsh} @cmd),"\n" if $self->{verbose};
    if ($self->{debug}) {
      warn ">    : ",qsh($cmd[-2])," -> ",main::decode_foruser(readlink $cmd[-2]),"\n"
        if -l $cmd[-2];
      warn ">    : ",qsh($cmd[-1])," -> ",main::decode_foruser(readlink $cmd[-1]),"\n"
        if -l $cmd[-1];
      warn "> @cmd\n";
    }
    exec @cmd;
    die "exec failed: $!";
  }
  waitpid($pid,0);
  $self->{exit_status} = (($? >> 8) | ($? & 0xFF));
}

sub exit_status { $_[0]->{exit_status} }

__END__

=pod

=encoding UTF-8

=head1 NAME

diff_spreadsheets - show differences between spreadsheets/csvs readably

=head1 SYNOPSIS

diff_spreadsheets [OPTIONS] file1.csv file2.xlsx!Sheet1 

diff_spreadsheets [OPTIONS] file1.xls file2.ods  # all coresponding sheets

=head1 DESCRIPTION

CSV files may be always used.  Spreadsheets may be used if 
Libre Office 7.2 or later is installed.

If each input has only one sheet then they are compared regardless
of sheet names.  Each file could be a .CSV, a spreadsheet workbook
containing a single sheet, or a multi-sheet workbook with a
sheet name specified (using the syntax shown).

Otherwise, I<every> sheet contained in each workbook is comapred with the
same-named sheet in the other file, warning about any un-paired sheets.

Tabs, newlines, etc. and non-printable characters are replaced with
escapes like "\t" or "\n" for human consumption.

=head1 OPTIONS

=head2 -m --method [native | diff | tkdiff | git]

I<native> (the default) shows only the changed I<cells> in rows which
differ.   Corresponding columns are identified by title,
and need not be in the same order 
(see also C<--id-columns>).

I<diff>, I<tkdiff>, or I<git> run an external tool on
temporary text (CSV) files created from the inputs, in which ignored columns
have been removed and non-graphic characters changed to escapes.

Most diff(1) options are accepted and passed through, but are not
documented here.

I<git> uses C<git diff> to, by default, color-code changed words;
most git(1) diff options are passed through.

=head2 --title-row [ROWNUMBER]

The row number containing column titles (first==1).
Auto-detected by default if the choice is obvious.
Specify zero if there are no titles.

=head2 --columns COLSPEC[,COLSPEC ...]

=head2 --columns -COLSPEC[,-COLSPEC ...]

Ignore differences in certain columns.

In the first form (not negated), the specified columns are used for
comparisons and others are ignored when deciding if a row changed.
When negated ("-" prefix) the specified columns are ignored.

I<COLSPEC> may be a column title, a /regex/ or m{regex} matching 
one or more titles, a column letter code (A, B, etc.), or an identifer
as described in L<Spreadsheet::Edit>.

=head2 --id-columns COLSPEC[,COLSPEC ...] 

Specify columns which together uniquely B<identify records> (i.e. rows).

Before running any "diff" operation, the data rows in each file are
sorted on the specified columns (comparison is alphabetic), so
that corresponding rows are in the same relative vertical position
in each file.  This makes it more likely that a change 
is detected as a CHANGE and not as confusing separate DELETE and INSERT.

B<C<--no-sort-rows>> disables this sorting so you can pre-sort the
data yourself (for example, if "id columns" should be sorted
numerically).

When using the default "native" diff algorithm, the I<original>
row numbers in each file are displayed in the results, hiding
the effects of sorting.

The "native" algorithm always displays the values of the "id" columns
in changed rows.

=head2 The following options apply only to the 'native' method:


=head2 --always-show COLSPEC[,COLSPEC ...]

When a changed row is displayed the changed cells are shown
plus all cells implied by C<--id-columns> whether changed or not.

The C<--always-show> option supplies an alternative set of columns
to always show instead of those given by C<--id-columns>.  The input
data is still sorted using C<--id-columns>.

=head2 Gory internals of the native diff method

The Diff algorithm is first applied using I<only> the C<id-column>s;
matching rows are assumed to correspond, and the other columns are
then compared and a "change" is reported if there are differences.
If --id-columns is not used, the Diff algorithm is applied using all
columns, and can get temporarily out of sync;
if a row was inserted or deleted adjacent to other rows
which were merely changed,
the result can be a string of "changed" reports which
actually describe pairs of unrelated records.

For even more control, see the B<--hashid-func> option.

=head2 --hashid-func PERLCODE

=head2 --hash-func PERLCODE

These options allow arbitrary filtering of row data before use.
PERLCODE must be an anonymous C<sub> declaration which is called with
two arguments for each row:

  ([cells], $row_index)

and must return a string representing those values, or undef
to ignore the row.

Both default to

  sub { join ",", @{$_[0]} }  # concatenate all cells separated by commas

During the Diff algorithm, I<hashid-func> is used to identify
pairs of rows which represent the same data record,
and then I<hash-func> is used to determine if a pair has
reportable changes.

Specifically: I<hashid-func> is called for each row
passing only values from C<--id-column> (or if
not specified then all columns).  
If undef is returned then that row is ignored.  

If the B<same> string is returned for a pair of
rows from the two files then any reportable differences are shown
as a "change" to the record. I<hash-func> is later called to determine
if there are reportable changes.  

If a B<unique> string is returned by I<hashid_func>,
i.e. there is no corresponding row in the other file, then that row
will be reported as "deleted" or "inserted" and I<hash-func> is not
used for that row.

When I<hash-func> is called it is passed I<all> cells in a row (or those
specified with C<--columns>).  If the result is different for
corresponding rows then a "change" is reported.

B<Argument order:>
When called with a row from the first file, cell values are passed in their natural order;
when called with a row from the second file, 
columns which also exist in the first file are passed first, 
I<in the order they appear in the first file>, 
followed by any columns which exist only in the second file.

If there are no titles then all columns are passed in their natural order.

=for notnow =head2 --setup-code PERLCODE
=for notnow 
=for notnow Allows arbitrary initialization, possibly editing the data in memory
=for notnow or declaring global variables (with C<our>) to be later used by
=for notnow C<--hashid-func> or C<--hash-func> subs.
=for notnow 
=for notnow The setup sub is called with parameters
=for notnow ($sheet1, $sheet2, \@idcxlist1, \@idcxlist2);

The user-defined PERLCODE subs are compiled into the same package.

=head2 --quote-char CHARACTER   (default is ")

=head2 --sep-char CHARACTER     (default is ,)

=head2 --encoding ENCODING      (default is UTF-8)

Used when reading CSV files (see L<Text::CSV>).  The same options are
applied to both input files.

=head2 --quiet

=head2 --verbose

=head2 --debug

=head2 --keep-temps

=head2 -h --help

Probably what you expect.

=head1 SEE ALSO

L<Spreadsheet::Edit>, L<Sreadsheet::Edit::IO>

=head1 AUTHOR

Jim Avera (jim.avera  gmail)

=head1 LICENSE

CC0 1.0 / Public Domain

=cut

#end
