#!/usr/local/bin/perl
### shatterproof.pm ###############################################################################
# 

### INCLUDES ######################################################################################
use strict;
use warnings;
use Carp;

use Switch;
use File::Basename;
use List::Util qw[min max];
use Statistics::Distributions;
use POSIX;
###################################################################################################

### HISTORY #######################################################################################
# Version       Date            Coder   	Comments
# 0.1           2012/03/19      sgovind      	Versioning start point
# 0.2		2012/04/03	sgovind		moved input validation methods and run methods
#						from shatterproof.pl to here
#
my $VERSION = 0.3;

package shatterproof;

### Global Variables ##############################################################################
my $pos = 0;	#used to parse command line variables
my $ARGC;	#stores the number of command line arguments provided

my %chromosome_length = (			#stores the sequence length of each chromosome
			 X => 154913754,
			 Y => 57741652,
			 1 => 247199719,
			 2 => 242751149,
			 3 => 199446827,
			 4 => 191263063,
			 5 => 180837866,
			 6 => 170896993,
			 7 => 158821424,
			 8 => 146274826,
			 9 => 140442298,
			 10 => 135374737,
			 11 => 134452384,
			 12 => 132289534,
			 13 => 114127980,
			 14 => 106360585,
			 15 => 100338915,
			 16 => 88822254,
			 17 => 78654742,
			 18 => 76117153,
			 19 => 63806651,
			 20 => 62435965,
			 21 => 46944323,
			 22 => 49528953
			);

my $TP53_start 	= 1000000*7.57;	#start base pair of the TP53 gene
my $TP53_end 	= 1000000*7.59;	#end base pair of the TP53 gene

my $insertion_data_present = 0;
my $LOH_data_present = 0;


#The values for the following 13 variables are defined in the config.pl file 
our $bin_size;			#number of bases pairs that will be compressed into 1 region when analyzing the genome
						#this value defines how many base pairs are included in one array element in the data_hash_ref varaibles

our $localization_window_size;			#number of regions to sum together when performing sliding window analysis of the genome

our $expected_mutation_density;			#the expected mutation density of translocations in a highly mutated region
						#used to calculate spread factor of translocations
our $low_mutation_density_threshold;		#the mutation density that will be used to call likely regions

our $collapse_regions;				#flag variable
						#value 1:	merge overlapping CNV regions that have the same copy number
						#value 0:	do not merge overlapping CNV regions that have the same copy number. If such 
						#		regions are found an error is thrown

our $outlier_deviation;				#the number of standard deviations away from the mean a value has to be in order to be considered non-significant

our $cnv_weight;				#the scoring formula weight given to the aberrant CNV hallmark 
our $mutation_density_weight;			#the scoring formula weight given to the localization of mutations to a specific region on the chromosome
our $genome_localization_weight; 		#the scoring formula weight given to the localization of mutations to a specific chromosome
our $translocation_weight; 			#the scoring formula weight given to the localization of translocations
our $insertion_breakpoint_weight; 		#the scoring formula weight given to the number of insertions found at translocation breakpoints
our $loh_weight;				#the scoring formula weight given to the amount of heterozygosity that is retained in a mutated region
our $tp53_mutated_weight; 			#the scoring formula weight given to the presents or absence of a TP53 mutation

### SUB METHODS ###################################################################################

#=head2 Sub-Method: run
### run ###########################################################################################
# Description:
#	Main method called by shatterproof.pl
#	Calls primary sub methods
#
# Input variables:
#	$argv_ref:	reference to @ARGV

#=cut
sub run {

	my $argv_ref = shift;		#parse parameters
	my @argv = @{$argv_ref};	#dereference array reference

	my $cnv_directory;		#stores the path to the directory where the CNV input files are found
	my $trans_directory;		#stores the path to the directory where the translocation input files are found
	my $insertion_directory;		#stores the path to the directory where the insertion input files are found
	my $loh_directory;		#stores the path to the directory where the loss of hetrozygosity input files are found
	my $output_directory;		#stores the path to the directory where output files will be placed

	my $config_file_path;		#stores the path to the configuration file

	my $tp53_mutated = 0;		#flag variable to indicate if the TP53 gene should be considered to be mutated

	my @cnv_files;			#list of CNV input files
	my @trans_files;		#list of translocation input files
	my @insertion_files;			#list of insertion input files
	my @loh_files;			#list of LOH input files

	
	my $chromosome_copy_number_count_hash_ref;	#hash
							#key1: 	chromosome eg. 1,2,X,Y
							#key2:	copy-number state eg. 0,1,3,20
							#value:	number of regions with copy number key2

	my $chromosome_cnv_breakpoints_hash_ref;	#hash
							#key: 	chromosome eg. 1,2,X,Y
							#value:	an array storing the start and end points of all cnv regions on key

	my $chromosome_translocation_count_hash_ref;	#hash
							#key1: 	chromosome eg. 1,2,X,Y
							#key2: 	chromosome eg. 1,2,X,Y
							#value:	number of translocations between key1 and key2

	my $chromosome_insertion_count_hash_ref;		#hash
							#key: 	chromosome eg. 1,2,X,Y
							#value:	number of insertions on key

	my $chromosome_loh_breakpoints_hash_ref;	#hash
							#key: 	chromosome eg. 1,2,X,Y
							#value:	an array storing the start and end points of all loh regions on key


	my $genome_trans_breakpoints_hash_ref;		#hash
							#key: 	chromosome eg. 1,2,X,Y
							#value:	an array storing all the translocation breakpoints on key
							
	my $genome_trans_insertion_breakpoints_hash_ref;	#hash
							#key: 	chromosome eg. 1,2,X,Y
							#value:	an array storing only the translocation breakpoints on key that have a insertion with 10 base pairs

	my $genome_mutation_density_hash_ref;		#hash
							#key: 	chromosome eg. 1,2,X,Y
							#value:	the total number of mutation on key divided by the sequence length of key


	my $genome_cnv_data_hash_ref = initialize_genome_hash();	#hash {key1}[index]{key2}
									#key1: 	chromosome eg. 1,2,X,Y
									#value:	an array storing references to hashes which contain information about the 
									#	CNVs in each region of key1. The index of the array indicated the region
									#key2:	'BPcount' -> gives the number of CNV breakpoints in the region.
									#	a number, eg. '1' -> gives the number of subregions within the region that 
									#	have a copy number of 1
									
	my $genome_trans_data_hash_ref = initialize_genome_hash();	#hash {key1}[index]{key2}{key3}
									#key1: 	chromosome eg. 1,2,X,Y
									#value1:an array storing references to hashes which contain information about the 
									#	translocations in each region of key1. The index of the array indicated the region.
									#key2:	'BPcount' -> gives the number of translocation breakpoints in the region
									#	'in' -> gives a reference to a hash that contains information about translocations
									#	into the region
									#	'out' -> gives a reference to a hash that contains information about translocations
									#	out of the region
									#key3:	chromosome eg. 1,2,X,Y
									#value2:the number of subregions in the region that were translocated to key1 from key3 if key2 = 'in'
									#	or
									#	the number of subregions in the region that were translocated from key1 to key3 if key2 = 'out'
	
	my $genome_insertion_data_hash_ref = initialize_genome_hash();	#hash {key1}[index]
									#key1: 	chromosome eg. 1,2,X,Y
									#value:	an array storing the number of insertions in each region of key1
									#	The index of the array indicated the region


	my $genome_cnv_data_windows_hash_ref;		#hash {key1}[index]
							#key1: 	chromosome eg. 1,2,X,Y
							#value:	an array storing the number of CNV breakpoints in each window of the genome.
							#	Each window begins at the region indicated by the array index
							
	my $genome_trans_data_windows_hash_ref;		#hash {key1}[index]
							#key1: 	chromosome eg. 1,2,X,Y
							#value:	an array storing the number of translocation breakpoints in each window of the genome.
							#	Each window begins at the region indicated by the array index
							

	my $genome_mutation_data_windows_hash_ref;	#hash {key1}[index]
							#key1: 	chromosome eg. 1,2,X,Y
							#value:	an array storing the total number of mutation breakpoints in each window of the genome.
							#	Each window begins at the region indicated by the array index

	my $suspect_regions_array_ref;		#reference to array that stores regions where chromothripsis most likely occured. Format: chr start end
	my $likely_regions_array_ref;		#reference to array that stores regions where chromothripsis may have occured. Format: chr start end


	#Validate input arguements and parse them to the correct variables
	validate_input(\@argv, \$cnv_directory, \$trans_directory, \$insertion_directory, \$loh_directory, \$tp53_mutated, \$output_directory, \$config_file_path);

	#Load the values from the config file
	load_config_file($config_file_path);

	print "CNV dir:\t$cnv_directory\n";
	print "Trans dir:\t$trans_directory\n";

	if(defined($insertion_directory)){
		print "insertion dir:\t$insertion_directory\n";
		}

	if(defined($loh_directory)){
		print "LOH dir:\t$loh_directory\n";
		}

	print "Output dir:\t$output_directory\n";

	print "Force TP53 Mutation:\t$tp53_mutated\n\n";

	#Get a list of files for each of the provided input directories
	@cnv_files = glob ("$cnv_directory"."*.spc");
	@trans_files = glob ("$trans_directory"."*.spt");

	if(scalar(@cnv_files)==0 || scalar(@trans_files)==0){
		die "ERROR: no CNV or translocation input files found\n";
		}

	if(defined($insertion_directory)){
		@insertion_files = glob ("$insertion_directory"."*.vcf");
		}

	if(defined($loh_directory)){
		@loh_files = glob ("$loh_directory"."*.spl");
		}

	#Echo a list of all the input files
	$" = "\n\t\t";
	print "CNV files:\t@cnv_files\n\n";
	print "Trans files:\t@trans_files\n\n";
	$" = "\n\t\t";
	if(scalar(@insertion_files)==0){
		print "Indel files:\t-none\n\n";
		}
	else{
		print "Indel files:\t@insertion_files\n\n";
		}
	$" = "\n\t";
	if(scalar(@loh_files)==0){
		print "LOH files:\t-none\n\n";
		}
	else{
		print "LOH files:\t@loh_files\n\n";
		}
	$" = " ";

	#Create the output directory if it does not exist
	mkdir ("$output_directory",0770) unless (-d "$output_directory");

	#Check that the output directory exists
	if(!(-e $output_directory)){
		die "ERROR: could not create directory: $output_directory\n";
		}

	print "\n--analyzing CNV data\n";
	($genome_cnv_data_hash_ref, $chromosome_copy_number_count_hash_ref, $chromosome_cnv_breakpoints_hash_ref) = analyze_cnv_data($output_directory, \@cnv_files, $bin_size, \$tp53_mutated);
	print "---done analyzing CNV data\n\n";

	print "--analyzing translocation data\n";
	($genome_trans_data_hash_ref, $chromosome_translocation_count_hash_ref, $genome_trans_breakpoints_hash_ref) = analyze_trans_data($output_directory, \@trans_files, $bin_size, \$tp53_mutated);
	print "---done analyzing translocation data\n\n";

	#If insertion data was provided then analyze it
	if(defined($insertion_directory)){
		print "--analyzing insertion data\n";
		($genome_insertion_data_hash_ref, $chromosome_insertion_count_hash_ref, $genome_trans_insertion_breakpoints_hash_ref) = analyze_insertion_data($output_directory, \@insertion_files, $bin_size, $genome_trans_breakpoints_hash_ref, \$tp53_mutated);
		print "---done analyzing insertion data\n\n";
		}	

	#Delete intermediate storage
	%$genome_trans_breakpoints_hash_ref = ();
	undef $genome_trans_breakpoints_hash_ref;

	#If LOH data was provided then analyze it
	if(defined($loh_directory)){
		print "--analyzing LOH data\n";
		($chromosome_loh_breakpoints_hash_ref) = analyze_loh_data($output_directory, \@loh_files);
		print "---done analyzing LOH data\n\n";

		#Check that the correct format of the LOH hash has been preserved
		my %loh_hash = %{$chromosome_loh_breakpoints_hash_ref};
		for my $key1 (keys %loh_hash){
			my @a = @{$loh_hash{$key1}};
			my $size = @a;

			if($size % 2 != 0){
				die "ERROR: odd number of loh breakpoints recorded for chromosome $key1\n";
				}
			}

		}

	print "--calculating chromosome mutation densities\n";
	$genome_mutation_density_hash_ref = calculate_genome_localization($output_directory, $chromosome_copy_number_count_hash_ref, $chromosome_translocation_count_hash_ref);
	print "---done calculating chromosome mutation densities\n\n";

	print "--calculating chromosome region mutation densities\n";
	($suspect_regions_array_ref, $likely_regions_array_ref, $genome_cnv_data_windows_hash_ref, $genome_trans_data_windows_hash_ref, $genome_mutation_data_windows_hash_ref) = calculate_chromosome_localization($output_directory, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $bin_size, $localization_window_size);
	print "---done calculating chromosome region mutation densities\n\n";

	print "--analyzing suspect regions\n";
	analyze_suspect_regions($output_directory, $suspect_regions_array_ref, $genome_mutation_density_hash_ref, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $bin_size, $localization_window_size, $tp53_mutated, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref);
	print "---done analyzing suspect regions\n\n";

	print "--analyzing likely regions\n";
	analyze_likely_regions($output_directory, $likely_regions_array_ref, $genome_mutation_density_hash_ref, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $bin_size, $localization_window_size);
	print "---done analyzing likely regions\n\n";

	print "--calculating copy number count\n";
	check_copy_number_count($output_directory, $chromosome_copy_number_count_hash_ref);
	print "---done calculating copy number count\n\n";

	print "--calculating switch count\n";
	check_copy_number_switches($output_directory, $chromosome_copy_number_count_hash_ref);
	print "---done calculating switch count\n\n";

	print "--calculating interchromosomal translocation rate\n";
	calculate_interchromosomal_translocation_rate($output_directory, $chromosome_translocation_count_hash_ref);
	print "---done calculating interchromosomal translocation rate\n";

	}#sub run

#=head2 Sub-Method: validate_input
### validate_input ################################################################################
# Description:
#	Validates command line arguments. Prints error messages if some input if invalid.
#
# Input variables:
#	$argv_ref:		reference to @ARGV
#	$cnv_directory_ref:	reference to variable storing the CNV input directory
#	$trans_directory_ref:	reference to variable storing the translocation input directory
#	$insertion_directory_ref:	reference to variable storing the insertion input directory
#	$loh_directory_ref:	reference to variable storing the LOH input directory
#	$tp53_mutated_ref:	reference to variable storing the tp53 mutated flag
#	$output_directory_ref:	reference to variable storing the output directory
#	$config_file_path_ref:	reference to variable storing the path to the config file

#=cut
sub validate_input {

	#Parse parameters
	my $argv_ref = shift;
	my @argv = @{$argv_ref};

	my $cnv_directory_ref = shift;
	my $trans_directory_ref = shift;
	my $insertion_directory_ref = shift;
	my $loh_directory_ref = shift;
	my $tp53_mutated_ref = shift;
	my $output_directory_ref = shift;
	my $config_file_path_ref = shift;
	
	#Determine number of command line arguements
	$ARGC = @argv;

	#Parse the command line arguements
	switch($ARGC){
		case 0 { usage(0); }	#Print error message if no arguements were entered

		case 1 {		#Check for help option
			if($argv[0] eq "--help"){
				man_text();
				}
			else{
				usage(1);
				}
		       }#case 1

		else   {

			if($argv[$pos] eq "--cnv"){	#Check for the cnv input directory option, this field is mandatory
				next_arg(2);
				$$cnv_directory_ref = $argv[$pos];
				next_arg(3);
				}
			else {
				usage(4);
				}


			if($argv[$pos] eq "--trans"){	#Check for the translocation input directory option, this field is mandatory
				next_arg(5);
				$$trans_directory_ref = $argv[$pos];
				next_arg(6);
				}
			else {
				usage(7);
				}

			if($argv[$pos] eq "--insrt"){	#Check for the insertion input directory option
				next_arg(8);
				$$insertion_directory_ref = $argv[$pos];
				$insertion_data_present = 1;
				next_arg(9);
				}

			if($argv[$pos] eq "--loh"){	#Check for the LOH input directory option
				next_arg(10);
				$$loh_directory_ref = $argv[$pos];
				$LOH_data_present = 1;
				next_arg(11);
				}

			if($argv[$pos] eq "--tp53"){	#Check for the TP53 gene mutation check option
				$$tp53_mutated_ref = 1;
				next_arg(12);
				}

			if($argv[$pos] eq "--config"){	#Check for the config file option, this field is mandatory
				next_arg(13);
				$$config_file_path_ref = $argv[$pos];
				next_arg(14);
				}
			else{
				usage(15);
				}

			if($argv[$pos] eq "--output"){	#Check for the output directory option, this field is mandatory
				next_arg(16);
				$$output_directory_ref = $argv[$pos];
				}
			else {
				usage(17);
				}

			#Check that there are no other command line arguments
			if($pos == $ARGC-1){
				last;
				}
			else{
				usage(18);
				}
		       }#else case
		}#switch($ARGC)

	}#sub validate_input

#=head2 Sub-Method: analyze_cnv_data
### analyze_cnv_data ##############################################################################
# Description:
#	Reads data from files located in the CNV input directory and populates:
#		$genome_cnv_data_hash_ref
#		$chromosome_copy_number_count_hash_ref
#		$chromosome_cnv_breakpoints_hash_ref
#
# Input variables:
#	$output_directory:	stores the path to the output directory
#	$cnv_files_array_ref:	reference to array containing all the CNV input files
#	$bin_size:		stores the size of the bins which the chromosome will be divided into
#	$tp53_mutated_ref:	reference to the tp53 mutated flag

#=cut
sub analyze_cnv_data {

	#Parse the parameters
	my $output_directory = shift;

	my $cnv_files_array_ref = shift;
	my @cnv_files = @$cnv_files_array_ref;

	my $bin_size = shift;
	my $tp53_mutated_ref = shift;

	my %genome_cnv_data = ();	#hash
					#key:	chromosome eg. 1,2,X,Y
					#value:	a reference to an array where each element corresponds to a bin along the 
					#	chromosome

	my @file_data;		#an array storing all the entries from every input file

	my $CURRENT_FILE;	#file handle to the current file that is open
	my $TP53_FILE;		#file handle to the TP53 CNV mutation output file

	my $line;		#stores raw line read in from file
	my @line_data;		#stores tokenized line read in from file

	my %chromosome_copy_number_count = (); 		#hash  {chr}{copy number}{count}
							#key1: chromosome eg. 1,2,X,Y
							#key2: a copy-number state eg 0,1,3,15
							#value: the number of region on key1 that have a copy number of key2	

	my %chromsome_cnv_breakpoints = (		#hash 	{chr}[start and end pairs]
					 X => [],	#key: 	chromosome eg. 1,2,X,Y
					 Y => [],	#value: an array that stored an ordered list of CNV breakpoints on key
					 1 => [],
					 2 => [],
					 3 => [],
					 4 => [],
					 5 => [],
					 6 => [],
					 7 => [],
					 8 => [],
					 9 => [],
					 10 => [],
					 11 => [],
					 12 => [],
					 13 => [],
					 14 => [],
					 15 => [],
					 16 => [],
					 17 => [],
					 18 => [],
					 19 => [],
					 20 => [],
					 21 => [],
					 22 => []
					);

	#Read the contents of the cnv files into memory
	foreach my $file (@cnv_files){
		#Open the file
		open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";

		#Check that the file is not empty
		if(eof($CURRENT_FILE)){
			close ($CURRENT_FILE);
			die "ERROR: $file is empty\n";
			}

		#Read header line and validate
		$line = <$CURRENT_FILE>;
		chomp($line);

		#Check that the format of the header line is correct
		if(!($line =~ m/^#chr\tstart\tend\tnumber\tquality$/)){
			close ($CURRENT_FILE);
			die "ERROR: header of cnv file $file is invalid\n";
			}

		#Read all the data lines in the file
		while( !(eof($CURRENT_FILE)) ){
			#read data line
			$line = <$CURRENT_FILE>;
			chomp($line);

			#Validate the data line
			if(!($line =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){
				die "ERROR: invalid line found ($line) in file $file\n";
				}

			#Split the data line and add it to the file_data array
			@line_data = (split (/\t/,$line));

			push(@file_data,[@line_data]);
			}

		close ($CURRENT_FILE);

		}#foreach my $file (@cnv_files)

	#Check that there are no overlapping CNV regions with different copy-numbers 
	@file_data = check_for_overlaps("cnv", \@file_data);


	#Create TP53 directory and output folder
	mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53");
	if(!(-e "$output_directory"."TP53")){
		die "ERROR: could not create folder $output_directory"."TP53\n";
		}

	#Create the TP53 CNV mutation file
	open ($TP53_FILE, ">", "$output_directory"."TP53/TP53.spc") or die "ERROR: could not create file: $output_directory"."TP53/TP53.spc";

	#Print the header of the file (same format as a .spc file)
	print $TP53_FILE "#chr\tstart\tend\tnumber\tquality";

	#For every data line that was read in from an input file,
	#record the CNV mutation in the genome_cnv_data_hash, 
	#record the exact breakpoints of the CNV, 
	#update the chromosome_copy_number_count hash and
	#check if the CNV is in the TP53 region
	for (my $n = 0; $n < scalar(@file_data); $n++){
		my $hash = {};

		#Ensure that the chromosome value is valid
		if(!($file_data[$n][0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
			die "WARNING: invalid chromosome field detected, line skipped: @{$file_data[$n]}\n";
			}

		#Parse the chromosome
		my $chr = $2;

		#Increment the copy-number count hash based on the line data
		$chromosome_copy_number_count {$chr}{$file_data[$n][3]}++;

		#Record the exact breakpoints of the CNV
		push (@{$chromsome_cnv_breakpoints{$chr}}, ($file_data[$n][1],$file_data[$n][2])); 

		#Calculate the bin for the start and end breakpoint
		my $start_index = int($file_data[$n][1]/$bin_size);
		my $end_index = int($file_data[$n][2]/$bin_size);

		my $update_bin = sub {
			my $source_chr 	= shift;
			my $index 	= shift;
			my $copy_num	= shift;

			my $genome_hash_ref	= shift;
			my %genome_hash		= %{$genome_hash_ref};
		
			my $hash = ();	

			if(!defined(@{$genome_hash{$source_chr}}[$index])){
				$hash->{'BPcount'} = 1;
				$hash->{$copy_num} = 0.5;
				@{$genome_hash{$source_chr}}[$index] = $hash;
				}
			else{	#if a bin does exist then increment the counts
				${@{$genome_hash{$source_chr}}[$index]}{'BPcount'}++;
				${@{$genome_hash{$source_chr}}[$index]}{$copy_num}+=0.5;
				}
			};

		#Check if a bin exists at the start index
		#if not, create one
		if(!defined(@{$genome_cnv_data{$chr}}[$start_index])){
			$hash->{'BPcount'} = 1;
			$hash->{$file_data[$n][3]} = 0.5;
			@{$genome_cnv_data{$chr}}[$start_index] = $hash;
			}
		else{ 	#If one does exist increment the counts
			${@{$genome_cnv_data{$chr}}[$start_index]}{'BPcount'}++;
			${@{$genome_cnv_data{$chr}}[$start_index]}{$file_data[$n][3]}+=0.5;
			}

		$hash = {};

		#Check if a bin exists at the end index
		#if not, create one
		if(!defined(@{$genome_cnv_data{$chr}}[$end_index])){
			$hash->{'BPcount'} = 1;
			$hash->{$file_data[$n][3]} = 0.5;
			@{$genome_cnv_data{$chr}}[$end_index] = $hash;
			}
		else{ 	#If one does exist increment the counts
			${@{$genome_cnv_data{$chr}}[$end_index]}{'BPcount'}++;
			${@{$genome_cnv_data{$chr}}[$end_index]}{$file_data[$n][3]}+=0.5;
			}


		#Check if the variation was in the TP53 gene
		if(
		   ( $chr ne 'X' && $chr ne 'Y' ) &&
		   ( $chr==17 )
		  ){
			if(
			   ( $file_data[$n][3] != 2 ) &&
			   ( ( $file_data[$n][1] >= $TP53_start && $file_data[$n][1] <= $TP53_end ) || ( $file_data[$n][2] >= $TP53_start && $file_data[$n][2] <= $TP53_end ) )
			  ){
				#If a CNV was found in the TP53 region
				$$tp53_mutated_ref = 1;	#set the TP53 mutated flag

				#Record the mutation in the TP53 CNV file
				print $TP53_FILE "\n";
				for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){
					print $TP53_FILE "$file_data[$n][$i]";
					if($i != scalar(@{$file_data[$n]})-1){
						print $TP53_FILE "\t";
						}#if
					}#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++)
				}#if
			}#if
		}#for (my $n = 0; $n < scalar(@file_data); $n++)

	close($TP53_FILE);

	#return hash
	return(\%genome_cnv_data, \%chromosome_copy_number_count, \%chromsome_cnv_breakpoints);

	}#sub analyze_cnv_data

#=head2 Sub-Method: check_for_overlaps
### check_for_overlaps ############################################################################
# Description:
#	Checks if there were overlapping CNV regions with different copy-numbers in the input files.
#	Also checks if there are any overlapping translocation destinations or overlapping LOH
#	regions.
#
# Input variables:
#	$type:		flag variable indicating which type of overlap to check for "cnv", "trans",
#			or "loh"
#	$file_data_ref:	reference to an array storing all the data lines read in from the specific
#			type of input file

#=cut
sub check_for_overlaps {

	my $type = shift;
	my $file_data_ref = shift;
	my @file_data = @$file_data_ref;

	my $start_overlap = 0;	#Flag variable indicating if the start position of one region is overlapping with 
				#another region

	my $end_overlap = 0;	#Flag variable indicating if the end position of one region is overlapping with
				#another region

	#Check for overlapping regions
	#Compares each entry in the array to every following entry
	for (my $n = 0; $n < scalar(@file_data); $n++){
		for (my $k = $n+1; $k < scalar(@file_data); $k++){

			#Check if the 2 regions in question are on the same chromosome
			if($file_data[$n][0] eq $file_data[$k][0]) {

				#Check if the end point of region 1 is within region 2	
				if($file_data[$n][2]>=$file_data[$k][1] && $file_data[$n][2]<=$file_data[$k][2]){
					$end_overlap = 1;
					}

				#Check if the start point of region 1 is within region 2
				if($file_data[$n][1]>=$file_data[$k][1] && $file_data[$n][1]<=$file_data[$k][2]){
					$start_overlap = 1;
					}

				#If an overlap was detected
				if($start_overlap==1 || $end_overlap==1) {

					#if it was a translocation overlap then throw an error
					if($type eq "trans"){
						die "ERROR: found overlapping translocation source regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
						}

					#if it was a LOH overlap then throw an error
					if($type eq "loh"){
						die "ERROR: found overlapping LOH regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
						}

					#if it was a CNV overlap check if the copy numbers are the same
					#If they are different throw an error
					if(
					   ( $type eq "cnv") &&
					   ( $file_data[$n][3] != $file_data[$k][3] )
					  ){
						die "ERROR: found overlapping regions with different copy number values:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
						}

					#if they are the same
				 	#and the user does not wish to collapse overlapping regions with the same copy number then throw an error
					elsif ($collapse_regions==0) {
						die "ERROR: found overlapping copy number regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
						}

					#if the user wishes to collapses overlapping regions with the same copy number then do so
					elsif ($collapse_regions==1) {
						#Region 2 completely encompasses region 1
						#So replace region 1 with region 2
						if($start_overlap==1 && $end_overlap==1){
							$file_data[$n][1] = $file_data[$k][1];
							$file_data[$n][2] = $file_data[$k][2];	
							}
						#The start point of region 1 is within region 2
						#So replace the start point of region 1 with the start point of region 2
						elsif($start_overlap==1){
							$file_data[$n][1] = $file_data[$k][1];
							}
						#The end point of region 1 is within region 2
						#So replace the end point of region 1 with the end point of region 2
						elsif($end_overlap==1){
							$file_data[$n][2] = $file_data[$k][2];
							}
						#If region 1 was modified then remove region 2 and re-check for overlaps
						if($start_overlap==1 || $end_overlap==1){
							@file_data = (@file_data[0..($k-1),($k+1)..(scalar(@file_data)-1)]);
							$start_overlap = 0;
							$end_overlap = 0;
							$k = $n+1;
							redo;
							}

						}#elsif ($collapse_regions==1)
					}#if($start_overlap==1 || $end_overlap==1)

				#Check if region 1 completely encompasses region 2
				elsif($file_data[$n][1]<=$file_data[$k][1] && $file_data[$n][2]>=$file_data[$k][2]){

					if($type eq "trans"){
						die "ERROR: found overlapping translocation source regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
						}

					if($type eq "loh"){
						die "ERROR: found overlapping LOH regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
						}

					#If the copy numbers are different throw an error
					if(
					   ( $type eq "cnv") &&
					   ( $file_data[$n][3] != $file_data[$k][3] )
					  ){
						die "ERROR: found overlapping regions with different copy number values:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
						}

					#If the copy numbers are the same but the user does not want to collapse then throw an error
					elsif ($collapse_regions==0) {
						die "ERROR: found overlapping copy number regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
						}

					#If the user does wish to collapse then remove the second region
					elsif ($collapse_regions==1) {
						@file_data = (@file_data[0..($k-1),($k+1)..(scalar(@file_data)-1)]);
						}
					}#elsif($file_data[$n][1]<=$file_data[$k][1] && $file_data[$n][2]>=$file_data[$k][2])
				}#if($file_data[$n][0] eq $file_data[$k][0])
			}#for (my $n = 0; $n < scalar(@file_data); $n++)
		}#for (my $n = 0; $n < scalar(@file_data); $n++)

	#Return the updated entries
	return (@file_data);

	}#sub check_for_overlaps

#=head2 Sub-Method: analyze_trans_data
### analyze_trans_data ############################################################################
# Description:
#	Reads data from files located in the trans input directory and popultates:
#		$genome_trans_data_hash_ref
#		$chromosome_translocation_count_hash_ref
#		$genome_trans_breakpoints_hash_ref		
#
# Input variables:
#	$output_directory:	stores the path to the output directory
#	$trans_files_array_ref:	reference to array containing all the translocation 
#				input files
#	$bin_size:		stores the size of the bins which the chromosome will be divided into
#	$tp53_mutated_ref:      reference to the tp53 mutated flag

#=cut
sub analyze_trans_data {

	#Parse the parameters
	my $output_directory = shift;
	my $trans_files_array_ref = shift;
	my @trans_files = @$trans_files_array_ref;

	my $bin_size = shift;

	my $tp53_mutated_ref = shift;

	my %genome_trans_data = ();	#hash
					#key:   chromosome eg. 1,2,X,Y
					#value: a reference to an array where each element corresponds to a bin along the
					#	chromosome

	my @file_data;		#an array storing all the entries from every input file

	my $CURRENT_FILE;	#file handle to the current file that is open
	my $TP53_FILE;		#file handle to the TP53 translocation mutation output file

	my $line;		#stores raw line read in from file
	my @line_data;		#stores tokenized line read in from file

	my $chr1;
	my $chr2;

	my %chromosome_trans_count = (); 	#hash 	{chr}{chr}{count}
						#key1:	chromosome eg. 1,2,X,Y
						#key2: 	chromosome eg. 1,2,X,Y
						#value:	the number of translocations between key1 and key2

	my %genome_trans_breakpoints = 	(		#hash {chr}[array]
					 X => [],	#key:	chromosome eg. 1,2,X,Y
					 Y => [],	#value:	an array storing all the translocation breakpoints
					 1 => [],	#	on key
					 2 => [],
					 3 => [],
					 4 => [],
					 5 => [],
					 6 => [],
					 7 => [],
					 8 => [],
					 9 => [],
					 10 => [],
					 11 => [],
					 12 => [],
					 13 => [],
					 14 => [],
					 15 => [],
					 16 => [],
					 17 => [],
					 18 => [],
					 19 => [],
					 20 => [],
					 21 => [],
					 22 => []
					);


	#Read the contents of the cnv files into memory
	foreach my $file (@trans_files){
		#open the file
		open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";

		#check that the file is not empty
		if(eof($CURRENT_FILE)){
			close ($CURRENT_FILE);
			die "ERROR: $file is empty\n";
			}

		#read header line and validate
		$line = <$CURRENT_FILE>;
		chomp($line);

		#validate the header line
		if(!($line =~ m/^#chr1\tstart\tend\tchr2\tstart\tend\tquality$/)){
			close ($CURRENT_FILE);
			die "ERROR: header of translocation file $file is invalid\n";
			}

		#read in every data line
		while( !(eof($CURRENT_FILE)) ){
			#read data line
			$line = <$CURRENT_FILE>;
			chomp($line);

			#validate the format of the data line
			if(!($line =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){
				#warn "WARNING: invalid line found and skipped: ($line) in file $file\n";
				next;
				}

			#Split the data line and add it to the array
			@line_data = (split (/\t/,$line));

			#if the start position is  greater than the end position for either the source or destination throw an error
			if(
			   ( $line_data[1] >= $line_data[2] ) ||
			   ( $line_data[4] >= $line_data[5] )
			  ){
				#warn "ERROR: invalid line found ($line) in file $file. Start or end values invalid\n";
				#next;
				}

			#Add the data line to the file_data array
			push(@file_data,[@line_data]);
			}

		close ($CURRENT_FILE);

		}#foreach my $file (@trans_files)

	#ignoring overlapping translocations for now
	#@file_data = check_for_overlaps("trans", \@file_data);

	#Create TP53 directory and output folder
	mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53");
	if(!(-e "$output_directory"."TP53")){
		die "ERROR: could not create folder $output_directory"."TP53\n";
		}

	#create TP53 translocation mutation file
	open ($TP53_FILE, ">", "$output_directory"."TP53/TP53.spt") or die "ERROR: could not create file: $output_directory"."TP53/TP53.spt";
	
	#Print the header of the file (same format as a .spt file)
	print $TP53_FILE "#chr1\tstart\tend\tchr2\tstart\tend\tquality";

	#For every data line that was read in from an input file,
	#record the translocation mutation in the genome_trans_data_hash, 
	#record the exact breakpoints of the translocation, 
	#update the chromosome_trans_count hash and
	#check if the translocation is in the TP53 region
	for (my $n = 0; $n < scalar(@file_data); $n++){
		
		my $hash = {};

		#verify that the chromosome 1 is valid
		if(!($file_data[$n][0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
			die "ERROR: invalid chromosome field detected in translocation file\n";
			}

		#parse out chromosome 1
		my $chr1 = $2;

		#verify that the chromosome 2 is valid
		if(!($file_data[$n][3] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
			die "ERROR: invalid chromosome field detected in translocation file\n";
			}

		#parse out chromosome 2
		my $chr2 = $2;

		#calculate the bin where each breakpoint will be placed
		my $start_index1 = int($file_data[$n][1]/$bin_size);
		my $end_index1 = int($file_data[$n][2]/$bin_size);

		my $start_index2 = int($file_data[$n][4]/$bin_size);
		my $end_index2 = int($file_data[$n][5]/$bin_size);

		my $update_bin = sub {
			my $source_chr 	= shift;
			my $dest_chr	= shift;
			my $index 	= shift;
			my $data	= shift;
			my $type	= shift;

			my $genome_hash_ref	= shift;
			my %genome_hash		= %{$genome_hash_ref};
		
			my $hash = ();	

			if(!defined(@{$genome_hash{$source_chr}}[$index])){
				$hash->{'BPcount'} = 1;
				push (@{$hash->{$type}{$dest_chr}}, $data);
				@{$genome_hash{$source_chr}}[$index] = $hash;
				}
			else{	#if a bin does exist then increment the counts
				${@{$genome_hash{$source_chr}}[$index]}{'BPcount'}++;
				push (@{${@{$genome_hash{$source_chr}}[$index]}{$type}{$dest_chr}}, $data);
				}

			};

		#check if a bin exists at $start_index1
		#if not, create one
		if(!defined(@{$genome_trans_data{$chr1}}[$start_index1])){
			$hash->{'BPcount'} = 1;
			push (@{$hash->{'out'}{$chr2}}, $file_data[$n][4]);
			@{$genome_trans_data{$chr1}}[$start_index1] = $hash;
			}
		else{	#if a bin does exist then increment the counts
			${@{$genome_trans_data{$chr1}}[$start_index1]}{'BPcount'}++;
			push (@{${@{$genome_trans_data{$chr1}}[$start_index1]}{'out'}{$chr2}}, $file_data[$n][4]);
			}

		$hash = {};
#		$update_bin->($chr1, $chr2, $end_index1, $file_data[$n][5], 'out', \%genome_trans_data);
		if(!defined(@{$genome_trans_data{$chr1}}[$end_index1])){
			$hash->{'BPcount'} = 1;
			push (@{$hash->{'out'}{$chr2}}, $file_data[$n][5]);
			@{$genome_trans_data{$chr1}}[$end_index1] = $hash;
			}
		else{
			${@{$genome_trans_data{$chr1}}[$end_index1]}{'BPcount'}++;
			push (@{${@{$genome_trans_data{$chr1}}[$end_index1]}{'out'}{$chr2}}, $file_data[$n][5]);
			}

		$hash = {};
#		$update_bin->($chr2, $chr1, $start_index2, $file_data[$n][1], 'in', \%genome_trans_data);
		if(!defined(@{$genome_trans_data{$chr2}}[$start_index2])){
			$hash->{'BPcount'} = 1;
			push (@{$hash->{'in'}{$chr1}}, $file_data[$n][1]);
			@{$genome_trans_data{$chr2}}[$start_index2] = $hash;
			}
		else{
			${@{$genome_trans_data{$chr2}}[$start_index2]}{'BPcount'}++;
			push (@{${@{$genome_trans_data{$chr2}}[$start_index2]}{'in'}{$chr1}}, $file_data[$n][1]);
			}

		$hash = {};
#		$update_bin->($chr2, $chr1, $end_index2, $file_data[$n][2], 'in', \%genome_trans_data);
		if(!defined(@{$genome_trans_data{$chr2}}[$end_index2])){
			$hash->{'BPcount'} = 1;
			push (@{$hash->{'in'}{$chr1}}, $file_data[$n][2]);
			@{$genome_trans_data{$chr2}}[$end_index2] = $hash;
			}
		else{
			${@{$genome_trans_data{$chr2}}[$end_index2]}{'BPcount'}++;
			push (@{${@{$genome_trans_data{$chr2}}[$end_index2]}{'in'}{$chr1}}, $file_data[$n][2]);
			}

#		for (my $i = $start_index1; $i <= $end_index1; $i++){
#			$hash = {};
#			if(!defined(@{$genome_trans_data{$chr1}}[$i])){
#				$hash->{'BPcount'} = 0;
#				$hash->{'in'}{$chr2} = 0.5;
#				@{$genome_trans_data{$chr1}}[$i] = $hash;
#				}
#			else{
#				${@{$genome_trans_data{$chr1}}[$i]}{'out'}{$chr2}++;
#				}
#			} 
#
#		for (my $i = $start_index2; $i <= $end_index2; $i++){
#			$hash = {};
#			if(!defined(@{$genome_trans_data{$chr2}}[$i])){
#				$hash->{'in'}{$chr1} = 1;
#				$hash->{'BPcount'} = 0;
#				@{$genome_trans_data{$chr2}}[$i] = $hash;
#				}
#			else{
#				${@{$genome_trans_data{$chr2}}[$i]}{'in'}{$chr1}++;
#				}
#			} 
	
		#Increment hash translocation counts
		$chromosome_trans_count{$chr1}{$chr2}++;
		#if the translocation is intra-chromosomal then don't count it twice
		if($chr1 ne $chr2){
			$chromosome_trans_count{$chr2}{$chr1}++;
			}

		#store the breakpoints in their bins
		push (@{$genome_trans_breakpoints{$chr1}}, $file_data[$n][1]);
		push (@{$genome_trans_breakpoints{$chr1}}, $file_data[$n][2]);

		push (@{$genome_trans_breakpoints{$chr2}}, $file_data[$n][4]);
		push (@{$genome_trans_breakpoints{$chr2}}, $file_data[$n][5]);

		#Check if the translocation origin was in the TP53 gene
		if(
		   ( $chr1 ne 'X' && $chr1 ne 'Y' ) &&
		   ( $chr1==17 )
		  ){
			if(
			   ( ( $file_data[$n][1] >= $TP53_start && $file_data[$n][1] <= $TP53_end ) || ( $file_data[$n][2] >= $TP53_start && $file_data[$n][2] <= $TP53_end ) )
			  ){
				#if a mutation was found, set the TP53 mutated flag
				$$tp53_mutated_ref = 1;
				print $TP53_FILE "\n";
				#print the translocation data line to the TP53 translocation mutation output file
				for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){
					print $TP53_FILE "$file_data[$n][$i]";
					if($i != scalar(@{$file_data[$n]})-1){
						print $TP53_FILE "\t";
						}
					}#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++)
				}#if
			}#if

		#Check if the translocation destination was in the TP53 gene
		if(
		   ( $chr2 ne 'X' && $chr2 ne 'Y' ) &&
		   ( $chr2==17 )
		  ){
			if(
			   ( ( $file_data[$n][4] >= $TP53_start && $file_data[$n][4] <= $TP53_end ) || ( $file_data[$n][5] >= $TP53_start && $file_data[$n][5] <= $TP53_end ) )
			  ){
				$$tp53_mutated_ref = 1;
				print $TP53_FILE "\n";
				for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){
					print $TP53_FILE "$file_data[$n][$i]";
					if($i != scalar(@{$file_data[$n]})-1){
						print $TP53_FILE "\t";
						}
					}#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++)
				}#if
			}#if
		}#for (my $n = 0; $n < scalar(@file_data); $n++)

	close($TP53_FILE);

	#return hash
	return(\%genome_trans_data, \%chromosome_trans_count, \%genome_trans_breakpoints);

	}#sub analyze_trans_data

### analyze_insertion_data ##############################################################################
# Description:
#	Reads data from files located in the insertion input directory and populates:
#		$genome_insertion_data_hash_ref
#		$chromosome_insertion_count_hash_ref
#		$genome_trans_insertion_breakpoints_hash_ref	
#
# Input variables:
#	$output_directory:			stores the path to the output directory
#	$insertion_files_array_ref:		reference to array containing all the insertion input files
#	$bin_size:				stores the size of the bins which the chromosome will be divided into
#	$genome_trans_breakpoints_hash_ref:	store reference to hash that contains the translocation breakpoints on
#						each chromosome
#	$tp53_mutated_ref:      		reference to the tp53 mutated flag
#
sub analyze_insertion_data {

	#Parse Parameters
	my $output_directory = shift;
	my $insertion_files_array_ref = shift;
	my @insertion_files = @$insertion_files_array_ref;

	my $bin_size = shift;

	my $genome_trans_breakpoints_hash_ref = shift;
	my %genome_trans_breakpoints = %$genome_trans_breakpoints_hash_ref;

	my $tp53_mutated_ref = shift;

	my %genome_insertion_data = ();	#hash
					#key:   chromosome eg. 1,2,X,Y
					#value: a reference to an array where each element corresponds to a bin along the
					#	chromosome

	my $CURRENT_FILE;	#file handle to the current file that is open
	my $TP53_FILE;		#file handle to the TP53 insertion mutation output file

	my $line;		#stores raw line read in from file
	my @line_data;		#stores tokenized line read in from file

	my $chr;

	my $file_name;
	my $path;
	my $suffix;

	my $rm_insertion_file_result;

	my $insertion_found = 0;

	my %chromosome_insertion_count = (); 	#hash {chr}{count}
						#key: chromosome eg. 1,2,X,Y
						#value: the number of insertions found on key

	my %genome_trans_insertion_breakpoints = (	#hash
					 X => [],	#key:	chromosome eg. 1,2,X,Y
					 Y => [],	#value: an array storing all the insertion start positions on key
					 1 => [],
					 2 => [],
					 3 => [],
					 4 => [],
					 5 => [],
					 6 => [],
					 7 => [],
					 8 => [],
					 9 => [],
					 10 => [],
					 11 => [],
					 12 => [],
					 13 => [],
					 14 => [],
					 15 => [],
					 16 => [],
					 17 => [],
					 18 => [],
					 19 => [],
					 20 => [],
					 21 => [],
					 22 => []
					);

	#Create TP53 directory and output folder
	mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53");
	if(!(-e "$output_directory"."TP53")){
		die "ERROR: could not create folder $output_directory"."TP53\n";
		}

	#for each file in the insertion input file array
	foreach my $file (@insertion_files){

		#Parse the file name, path and file type
		( $file_name, $path, $suffix ) = File::Basename::fileparse( $file, "\.[^.]*");

		#open the file
		open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";

		#ensure that the file is not empty
		if(eof($CURRENT_FILE)){
			die "ERROR: $file is empty\n";
			}

		#create the TP53 insertion mutation output file
		open ($TP53_FILE, ">", "$output_directory"."TP53/$file_name"."$suffix") or die "ERROR: could not create file: $output_directory"."TP53/$file_name"."$suffix";

		#read header lines
		$line = <$CURRENT_FILE>;
		chomp($line);

		#print the VCF header lines to the TP53 insertion mutation output file
		while ($line =~ m/^#(.*?)/){ 
			print $TP53_FILE "$line\n";
			$line = <$CURRENT_FILE>;
			chomp($line);
			}

		#read all the data lines in the file
		while(1){
			@line_data = (split (/\t/,$line));

			#verify that the chromosome is valid and that the mutation is an insertion type
			if(
			   ( !($line_data[1] =~ m/^[0-9]+$/) 				 ) ||
			   ( !($line_data[0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|x|y|[1-9])/) ) ||
			   ( length($line_data[4]) <= length($line_data[3])		 )
			  ){
				warn "ERROR: invalid chromosome or non-insertion VCF data line found and skipped:\t$line\n";
				$line = <$CURRENT_FILE>;
				unless($line){last;}
				chomp($line);
				next;
				}

			#parse the chromosome
			$chr = $2;

			#change to uppercase if 'x' or 'y' is found
			if($chr eq 'x'){
				$chr = 'X';
				}
			if($chr eq 'y'){
				$chr = 'Y';
				}

			#increment the insertion count of the chromosome	
			$chromosome_insertion_count{$chr}++;

			#check if a bin exists at the insertion start position
			#if one does not, then create one
			if(!defined(@{$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)])){
				${$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)] = 1;
				}
			else{	#if one does, then increment the count
				${$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)]++;
				}

			#Search through the list of translocation breakpoints on the same chromosome
			foreach my $bp (@{$genome_trans_breakpoints{$chr}}){
				#if the insertion is within 10bps of the breakpoints and the insertion to the list stored in
				#the genome_trans_insertion_breakpoints hash
				if( $line_data[1] < $bp+10 && $line_data[1] > $bp-10){
					push (@{$genome_trans_insertion_breakpoints{$chr}}, $bp);
					}
				}

			#Check if the insertion was in the TP53 gene
			if(
			   ( $chr ne 'X' && $chr ne 'Y' ) &&
			   ( $chr==17 )
			  ){
				if($line_data[1] >= $TP53_start && $line_data[1] <= $TP53_end){
					$$tp53_mutated_ref = 1; #if a mutation was found in the region set the TP53 mutated flag
					$insertion_found = 1;
					print $TP53_FILE "$line\n"; 	#print the culprit data line to the TP53 insertion 
									#mutation output file
					}
				}

			#read the next line
			$line = <$CURRENT_FILE>;
			#check that the end of file has not been reached
			unless($line){last;}
			chomp($line);
			}#while(1)

		#close file
		close($CURRENT_FILE);
		close ($TP53_FILE);

		#if an insertion was not found in the current file then delete the created TP53 insertion mutation output file
		if($insertion_found!=1){
			my $dir = "$output_directory"."TP53/$file_name"."$suffix";
			$rm_insertion_file_result = `rm $dir`;
			}

		$insertion_found = 0;
		}#foreach my $file (@insertion_files)

	#return hash
	return(\%genome_insertion_data, \%chromosome_insertion_count, \%genome_trans_insertion_breakpoints);

	}#sub analyze_insertion_data


### analyze_loh_data ##############################################################################
# Description:
#	Reads data from files located in the LOH input directory and populates:
#		$chromosome_loh_breakpoints_hash_ref	
#
# Input variables:
#	$output_directory:	stores the path to the output directory
#	$loh_files_array_ref:	reference to array containing all the LOH input files
#
sub analyze_loh_data {

	#parse the parameters
	my $output_directory = shift;
	my $loh_files_array_ref = shift;
	my @loh_files = @$loh_files_array_ref;

	my $tp53_mutated_ref = shift;

	my @file_data;	#an array storing all the entries from every input file

	my $CURRENT_FILE;	#file handle to the current file that is open
	my $TP53_FILE;		#file handle to the TP53 translocation mutation output file

	my $line;		#stores raw line read in from file
	my @line_data;		#stores tokenized line read in from files

	my %chromsome_loh_breakpoints = (		#hash {chr}[start and end pairs]
					 X => [],	#key: 	chromosome eg. 1,2,X,Y
					 Y => [],	#value:	an array that stores all the LOH breakpoints on key
					 1 => [],
					 2 => [],
					 3 => [],
					 4 => [],
					 5 => [],
					 6 => [],
					 7 => [],
					 8 => [],
					 9 => [],
					 10 => [],
					 11 => [],
					 12 => [],
					 13 => [],
					 14 => [],
					 15 => [],
					 16 => [],
					 17 => [],
					 18 => [],
					 19 => [],
					 20 => [],
					 21 => [],
					 22 => []
					);

	#Read the contents of the cnv files into memory
	foreach my $file (@loh_files){
		#open the file
		open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";

		#Ensure that the file is not empty
		if(eof($CURRENT_FILE)){
			close ($CURRENT_FILE);
			die "ERROR: $file is empty\n";
			}

		#read header line and validate
		$line = <$CURRENT_FILE>;
		chomp($line);

		#Validate the header line
		if(!($line =~ m/^#chr\tstart\tend\tquality$/)){
			close ($CURRENT_FILE);
			die "ERROR: header of loh file $file is invalid\n";
			}

		#Read all the data lines
		while( !(eof($CURRENT_FILE)) ){
			
			#read data line
			$line = <$CURRENT_FILE>;
			chomp($line);

			#validate the data line
			if(!($line =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){
				die "ERROR: invalid line found ($line) in file $file\n";
				}

			#Split the data line and add it to the array
			@line_data = (split (/\t/,$line));
			push(@file_data,[@line_data]);
			}

		close ($CURRENT_FILE);

		}#foreach my $file (@cnv_files)

	#Ensure that there are no overlapping LOH regions, or join them if the user indicated
	@file_data = check_for_overlaps("loh", \@file_data);

	#Create TP53 directory and output folder
	mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53");
	if(!(-e "$output_directory"."TP53")){
		die "ERROR: could not create folder $output_directory"."TP53\n";
		}

	#Create the TP53 LOH mutation output data file
	open ($TP53_FILE, ">", "$output_directory"."TP53/TP53.spl") or die "ERROR: could not create file: $output_directory"."TP53/TP53.spl";
	#Print the header for the output file (same format as a .spl file)
	print $TP53_FILE "#chr\tstart\tend\tquality";

	#For every data line that was read in
	for (my $n = 0; $n < scalar(@file_data); $n++){

		#Validate the chromosome field
		if(!($file_data[$n][0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
			die "ERROR: invalid chromosome field detected\n";
			}

		#Parse the chromosome
		my $chr = $2;

		#Add the breakpoints to the array for the chromosome
		push (@{$chromsome_loh_breakpoints{$chr}}, ($file_data[$n][1],$file_data[$n][2])); 


		#Check if the variation was in the TP53 gene
		if(
		   ( $chr ne 'X' && $chr ne 'Y' ) &&
		   ( $chr==17 )
		  ){
			if(
			   ( ( $file_data[$n][1] >= $TP53_start && $file_data[$n][1] <= $TP53_end ) || ( $file_data[$n][2] >= $TP53_start && $file_data[$n][2] <= $TP53_end ) )
			  ){
				#If a mutation was found in the TP53 region then set the TP53 mutated flag
				$$tp53_mutated_ref = 1;

				#Print the LOH data line to the TP53 LOH mutation output file
				print $TP53_FILE "\n";
				for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){
					print $TP53_FILE "$file_data[$n][$i]";
					if($i != scalar(@{$file_data[$n]})-1){
						print $TP53_FILE "\t";
						}#if
					}#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++)
				}#if
			}#if

		}#for (my $n = 0; $n < scalar(@file_data); $n++)

	close($TP53_FILE);

	#return hash
	return(\%chromsome_loh_breakpoints);

	}#sub analyze_loh_data



### calculate_genome_localization #################################################################
# Description:
#	Caculates the mutation density for each chromosome	
#
# Input variables:
#	$output_directory:			  stores the path to the output directory
#	$chromosome_copy_number_count_hash_ref:	  stores a reference to the hash storing the number 
#						  of CNV events on each chromosome
#	#chromosome_translocation_count_hash_ref: stores a reference to the hash storing the number
#						  of translocation events on each chromosome
#
sub calculate_genome_localization {

	#parse the parameters
	my $output_directory = shift;
	my $chromosome_copy_number_count_hash_ref = shift;
	my $chromosome_translocation_count_hash_ref = shift;

	my %chromosome_mutation_count;	#hash
					#key:	chromosome eg. 1,2,X,Y
					#value: the density of translocation and CNV events on key 
	
	my $density;	#store the mutation density for a chromosome

	my $OUTPUT_FILE;	#file handle to the output file

	#initialize all the counts to 0
	for (my $i=1; $i<23; $i++){
		$chromosome_mutation_count{$i} = 0;
		}
	$chromosome_mutation_count{'X'} = 0;
	$chromosome_mutation_count{'Y'} = 0;

	#add the number of CNV events on each chromosome
	for my $cnv_key1 ( keys %$chromosome_copy_number_count_hash_ref){
		for my $cnv_key2 (keys %{$chromosome_copy_number_count_hash_ref->{$cnv_key1}}){
			$chromosome_mutation_count{$cnv_key1} += $chromosome_copy_number_count_hash_ref->{$cnv_key1}->{$cnv_key2};
			}
		}	
	
	#add the number of translocation events on each chromosome
	for my $trans_key1 ( keys %$chromosome_translocation_count_hash_ref){
		for my $trans_key2 (keys %{$chromosome_translocation_count_hash_ref->{$trans_key1}}){
			$chromosome_mutation_count{$trans_key1} += $chromosome_translocation_count_hash_ref->{$trans_key1}->{$trans_key2};
			}
		}	

	#Create the output file
	open ($OUTPUT_FILE, ">", "$output_directory/genome_localization.log") or die "ERROR: could not create file $output_directory/genome_localization.log\n";
	#Print the header
	print $OUTPUT_FILE "#chr\tcount\tdensity\n";

	#for each chromosome print the count and overall density
	for my $chr ( sort keys %chromosome_mutation_count){
			$density = $chromosome_mutation_count{$chr}/$chromosome_length{$chr};
			
			print $OUTPUT_FILE "$chr";
			print $OUTPUT_FILE "\t";
			print $OUTPUT_FILE $chromosome_mutation_count{$chr};
			print $OUTPUT_FILE "\t";
			print $OUTPUT_FILE "$density";
			print $OUTPUT_FILE "\n";

			#Replace the count with the density
			$chromosome_mutation_count{$chr} = $density;
			}	

	close ($OUTPUT_FILE);

	#return the hash containing the densities
	return(\%chromosome_mutation_count);

	}#sub calculate_genome_localization

### calculate_chromosome_localization #############################################################
# Description:
#	Performs a sliding window analysis on the CNV and translocation data. Identifies regions
#	that have a density of mutation much greater than the average rate of mutation of the
#	genome.
#
# Input variables:
#	$output_directory:		stores the directory where output files are created
#	$genome_cnv_data_hash_ref:	reference to hash that stores position of all CNV breakpoints in 
#					the genome
#	$genome_trans_data_hash_ref:	reference to hash that stores position of all the
#					translocation breakpoints in the genome	
#	$bin_size:			size of the bins that divide up the genome
#	$window_size:			number of bins to evaluate in each window
#
sub calculate_chromosome_localization {

	#parse parameters
	my $output_directory = shift;
	my $genome_cnv_data_hash_ref = shift;
	my $genome_trans_data_hash_ref = shift;
	my $bin_size = shift;
	my $window_size = shift;

	my @suspect_regions;	#array storing the start position, end position and chromosome
				#of very highly mutated regions
	my @likely_regions;	#array storing the start position, end position and chromosome
				#of somewhat highly mutated regions

	my $in_suspect_region = 0;	#flag variables used in identifying highly mutated regions
	my $in_likely_region = 0;

	my $suspect_chr = -1;
	my $suspect_start = -1;
	my $suspect_end = -1;
	
	my $likely_chr = -1;
	my $likely_start = -1;
	my $likely_end = -1;

	my %genome_cnv_data_windows = 	(		#hash
					 X => [],	#key: chromosome eg. 1,2,X,Y
					 Y => [],	#value: an array storing the count of CNVs
					 1 => [],	#	in each window along the chromosome
					 2 => [],
					 3 => [],
					 4 => [],
					 5 => [],
					 6 => [],
					 7 => [],
					 8 => [],
					 9 => [],
					 10 => [],
					 11 => [],
					 12 => [],
					 13 => [],
					 14 => [],
					 15 => [],
					 16 => [],
					 17 => [],
					 18 => [],
					 19 => [],
					 20 => [],
					 21 => [],
					 22 => []
					);

	my %genome_trans_data_windows = (		#hash
					 X => [],	#key:	chromosome eg. 1,2,X,Y
					 Y => [],	#value:	an array storing a count of translocation
					 1 => [],	#	in each window along the chromosome
					 2 => [],
					 3 => [],
					 4 => [],
					 5 => [],
					 6 => [],
					 7 => [],
					 8 => [],
					 9 => [],
					 10 => [],
					 11 => [],
					 12 => [],
					 13 => [],
					 14 => [],
					 15 => [],
					 16 => [],
					 17 => [],
					 18 => [],
					 19 => [],
					 20 => [],
					 21 => [],
					 22 => []
					);

	my %genome_mutation_data_windows = (		#hash
					 X => [],	#key:	chromosome eg. 1,2,X,Y
					 Y => [],	#value:	an array storing a count of all mutations
					 1 => [],	#	in each window along the chromosome
					 2 => [],
					 3 => [],
					 4 => [],
					 5 => [],
					 6 => [],
					 7 => [],
					 8 => [],
					 9 => [],
					 10 => [],
					 11 => [],
					 12 => [],
					 13 => [],
					 14 => [],
					 15 => [],
					 16 => [],
					 17 => [],
					 18 => [],
					 19 => [],
					 20 => [],
					 21 => [],
					 22 => []
					);


	my $current_chr;		#current chromosome being analyzed
	my @current_chr_data = ();	#array storing the bins for the current chromosome

	my $genome_mean_mutation_density = 0;	#average density of all the windows across the genome
	my $total_genome_windows = 0;		#total number of windows across the genome
	my $genome_mutation_density_standard_deviation = 0;	#standard deviation of the mutation densities for
								#all the windows

	my $OUTPUT_FILE;	#file handle to output file

	$output_directory = $output_directory."mutation_clustering";

	#create output directories
	mkdir ("$output_directory",0770) unless (-d "$output_directory");
	if(!(-e "$output_directory")){
		die "ERROR: could not create folder $output_directory\n";
		}

	mkdir ("$output_directory/cnv",0770) unless (-d "$output_directory/cnv");
	if(!(-e "$output_directory")){
		die "ERROR: could not create folder $output_directory/cnv\n";
		}
	
	mkdir ("$output_directory/translocations",0770) unless (-d "$output_directory/translocations");
	if(!(-e "$output_directory")){
		die "ERROR: could not create folder $output_directory/translocations\n";
		}

	mkdir ("$output_directory/all_types",0770) unless (-d "$output_directory/all_types");
	if(!(-e "$output_directory")){
		die "ERROR: could not create folder $output_directory/all_types\n";
		}

	#compute the density of CNV mutations in each window
	for my $cnv_key ( keys %$genome_cnv_data_hash_ref){
		#get the array storing the CNV bins for the current chromosome
		@current_chr_data = @{$genome_cnv_data_hash_ref->{$cnv_key}};

		#check that the array is not empty
		if(scalar(@current_chr_data) > 0){

			#create an output file for this chromosome
			open ($OUTPUT_FILE, ">", "$output_directory/cnv/chr$cnv_key"."_cnv_localization.log") or die "ERROR: could not create file $output_directory/cnv/chr$cnv_key"."_cnv_localization.log";
			#print the header for the output file
			print $OUTPUT_FILE "#chr\tstart\tend\tdensity";

			@{$genome_cnv_data_windows{$cnv_key}}[0] = 0;	#initialize the count in the first window

			#Calculate the mutation count in the first window
			for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++){
				my %region_hash;
				if(!defined($current_chr_data[$chr_pos])){
					next;
					}
				%region_hash = %{$current_chr_data[$chr_pos]};

				@{$genome_cnv_data_windows{$cnv_key}}[0] += $region_hash{'BPcount'};
				}

			#print the values from the first window to the output file
			print $OUTPUT_FILE "\n";
			print $OUTPUT_FILE "$cnv_key";
			print $OUTPUT_FILE "\t";
			print $OUTPUT_FILE "0";
			print $OUTPUT_FILE "\t";
			print $OUTPUT_FILE ($window_size)*$bin_size;
			print $OUTPUT_FILE "\t";

			if(!defined(@{$genome_cnv_data_windows{$cnv_key}}[0])){
				print $OUTPUT_FILE "0";
				}
			else{
				print $OUTPUT_FILE (POSIX::ceil((@{$genome_cnv_data_windows{$cnv_key}}[0])/2))/($window_size*$bin_size);
				#add the cnv count to the total mutation count for the region
				@{$genome_mutation_data_windows{$cnv_key}}[0] += POSIX::ceil(@{$genome_cnv_data_windows{$cnv_key}}[0]/2);
				}

			#perform the sliding window analysis for the rest of the chromosome
			for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++){

				#check that the window will not overshoot the length of the chromosome			
				if( (($chr_pos+($window_size-1))*$bin_size) > $chromosome_length{$cnv_key}  ){
					last;
					}

				@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos] = 0;	#initialize the count for the current window

				my %past_region_hash;
				my %next_region_hash;

				my $prev_value = 0;
				my $next_value = 0;

				#get the count of the from the first bin from the previous window
				if(defined($current_chr_data[$chr_pos-1])){
					%past_region_hash = %{$current_chr_data[$chr_pos-1]};
					$prev_value = $past_region_hash{'BPcount'};
					}

				#get the count from the bin follow the last bin in the previous window
				if(defined($current_chr_data[$chr_pos+($window_size-1)])){
					%next_region_hash = %{$current_chr_data[$chr_pos+($window_size-1)]};
					$next_value = $next_region_hash{'BPcount'};	
					}

				#the count for the current window = the count from the previous window - the first bin of the previous window + the next bin along the chromosome
				@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos] += (@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos-1]) - ($prev_value) + ($next_value);

				#print the values for this window
				print $OUTPUT_FILE "\n";
				print $OUTPUT_FILE "$cnv_key";
				print $OUTPUT_FILE "\t";
				print $OUTPUT_FILE $chr_pos*$bin_size;
				print $OUTPUT_FILE "\t";
				print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size;
				print $OUTPUT_FILE "\t";

				if(!defined(@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos])){
					print $OUTPUT_FILE "0";
					}
				else{
					print $OUTPUT_FILE (POSIX::ceil((@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos])/2))/($window_size*$bin_size);
					#add the cnv count to the the total mutation count for the region
					@{$genome_mutation_data_windows{$cnv_key}}[$chr_pos] += POSIX::ceil(@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos]/2);
					}
				}#for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++)

			close ($OUTPUT_FILE);
			}#if(scalar(@current_chr_data) > 0)

		@current_chr_data = ();
		}#for my $cnv_key ( keys %$genome_cnv_data_hash_ref)

	#perform the sliding window analysis on the translocation mutation data
	for my $trans_key ( keys %$genome_trans_data_hash_ref){

		#get the array storing the translocation bins for the current chromosome	
		@current_chr_data = @{$genome_trans_data_hash_ref->{$trans_key}};

		#check that the array is not empty
		if(scalar(@current_chr_data) > 0){

			#create the output file
			open ($OUTPUT_FILE, ">", "$output_directory/translocations/chr$trans_key"."_translocation_localization.log") or die "ERROR: could not create file $output_directory/translocations/chr$trans_key"."_translocation_localization.log";
			#print the header for the output file
			print $OUTPUT_FILE "#chr\tstart\tend\tdensity";

			#initialize the translocation count for the first window
			@{$genome_trans_data_windows{$trans_key}}[0] = 0;

			#calculate the translocation mutation count in the first window
			for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++){
				my %region_hash;
				if(!defined($current_chr_data[$chr_pos])){
					next;
					}
				%region_hash = %{$current_chr_data[$chr_pos]};

				my %trans_hash_in;
				my %trans_hash_out;
				my $size = 0;

				#calculate the number of inbound translocation breakpoints
				if(defined($region_hash{'in'})){
					%trans_hash_in = %{$region_hash{'in'}};

					for my $key (keys %trans_hash_in){
						$size = @{$trans_hash_in{$key}};
						$size = $size/2;
						@{$genome_trans_data_windows{$trans_key}}[0] += $size;
						}
					}

				#calculate the number of outbound translocation breakpoints
				if(defined($region_hash{'out'})){
					%trans_hash_out = %{$region_hash{'out'}};

					for my $key (keys %trans_hash_out){
						if($key eq $trans_key){
							next;
							}
						$size = @{$trans_hash_out{$key}};
						$size = $size/2;
						@{$genome_trans_data_windows{$trans_key}}[0] += $size;
						}
					}

				}#for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++)

			#print the values from the first window to the output file	
			print $OUTPUT_FILE "\n";
			print $OUTPUT_FILE "$trans_key";
			print $OUTPUT_FILE "\t";
			print $OUTPUT_FILE "0";
			print $OUTPUT_FILE "\t";
			print $OUTPUT_FILE ($window_size)*$bin_size;
			print $OUTPUT_FILE "\t";

			if(!defined(@{$genome_trans_data_windows{$trans_key}}[0])){
				print $OUTPUT_FILE "0";
				}
			else{
				print $OUTPUT_FILE (POSIX::ceil(@{$genome_trans_data_windows{$trans_key}}[0]))/($window_size*$bin_size);
				#add the translocation mutation count to the total mutation count for the region
				@{$genome_mutation_data_windows{$trans_key}}[0] += POSIX::ceil(@{$genome_trans_data_windows{$trans_key}}[0]);
				}

			#perform the sliding window analysis for the rest of the chromosome
			for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++){

				if( (($chr_pos+($window_size-1))*$bin_size) > $chromosome_length{$trans_key} ){
					last;
					}

				@{$genome_trans_data_windows{$trans_key}}[$chr_pos] = 0;
				my %prev_region_hash;
				my %next_region_hash;

				my $prev_value = 0;
				my $next_value = 0;

				#Caculate the number of mutations in the first bin of the previous window
				if(defined($current_chr_data[$chr_pos-1])){
					%prev_region_hash = %{$current_chr_data[$chr_pos-1]};

					my $size = 0;
					my %prev_trans_hash_in;
					my %prev_trans_hash_out;

					if(defined($prev_region_hash{'in'})){
						%prev_trans_hash_in = %{$prev_region_hash{'in'}};

						for my $key (keys %prev_trans_hash_in){
							$size = @{$prev_trans_hash_in{$key}};
							$size = $size/2;
							$prev_value += $size;
							}
					}

					if(defined($prev_region_hash{'out'})){
						%prev_trans_hash_out = %{$prev_region_hash{'out'}};

						for my $key (keys %prev_trans_hash_out){
							if($key eq $trans_key){
								next;
								}
							$size = @{$prev_trans_hash_out{$key}};
							$size = $size/2;
							$prev_value += $size;
							}
						}

					}

				#Caculate the number of mutations in the last bin of the current window
				if(defined($current_chr_data[$chr_pos+($window_size-1)])){
					%next_region_hash = %{$current_chr_data[$chr_pos+($window_size-1)]};

					my $size = 0;
					my %next_trans_hash_in;
					my %next_trans_hash_out;

					if(defined($next_region_hash{'in'})){
						%next_trans_hash_in = %{$next_region_hash{'in'}};

						for my $key (keys %next_trans_hash_in){
							$size = @{$next_trans_hash_in{$key}};
							$size = $size/2;
							$next_value += $size;
							}
					}

					if(defined($next_region_hash{'out'})){
						%next_trans_hash_out = %{$next_region_hash{'out'}};

						for my $key (keys %next_trans_hash_out){
							if($key eq $trans_key){
								next;
								}
							$size = @{$next_trans_hash_out{$key}};
							$size = $size/2;
							$next_value += $size;
							}
						}


					}

				#total number of translocation mutations in the current window = number of mutations in previous window - the first bin of the previous window + next bin along the chromosome
				@{$genome_trans_data_windows{$trans_key}}[$chr_pos] += (@{$genome_trans_data_windows{$trans_key}}[$chr_pos-1]) - ($prev_value) + ($next_value);

				#print values from this window
				print $OUTPUT_FILE "\n";
				print $OUTPUT_FILE "$trans_key";
				print $OUTPUT_FILE "\t";
				print $OUTPUT_FILE $chr_pos*$bin_size;
				print $OUTPUT_FILE "\t";
				print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size;
				print $OUTPUT_FILE "\t";

				if(!defined(@{$genome_trans_data_windows{$trans_key}}[$chr_pos])){
					print $OUTPUT_FILE "0";
					}
				else{
					print $OUTPUT_FILE (POSIX::ceil(@{$genome_trans_data_windows{$trans_key}}[$chr_pos]))/($window_size*$bin_size);
					@{$genome_mutation_data_windows{$trans_key}}[$chr_pos] += POSIX::ceil(@{$genome_trans_data_windows{$trans_key}}[$chr_pos]);
					}
				}#for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++)


			close ($OUTPUT_FILE);
			}
		@current_chr_data = ();
		}

	#calculate the density of both types of mutations in each window in the genome
	for my $mutation_key ( keys %genome_mutation_data_windows){
	
		#check that some data exisits for the current chromosome
		if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){

			#create the output file
			open ($OUTPUT_FILE, ">", "$output_directory/all_types/chr$mutation_key"."_mutation_localization.log") or die "ERROR: could not create file $output_directory/all_types/chr$current_chr"."_mutation_localization.log";
			#print the header for the output file
			print $OUTPUT_FILE "#chr\tstart\tend\tdensity";
		
			my $density;

			#for every bin along the chromosome
			for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){
				$total_genome_windows++;	#increment the total number of windows in the genome
				
				#calculate the density of mutations in the window
				if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){
					$density = 0;
					}
				else{
					$density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size);
					}

				#sum the density values to calculate the mean
				$genome_mean_mutation_density += $density;

				#print the values for the window
				print $OUTPUT_FILE "\n";
				print $OUTPUT_FILE "$mutation_key";
				print $OUTPUT_FILE "\t";
				print $OUTPUT_FILE $chr_pos*$bin_size;
				print $OUTPUT_FILE "\t";
				print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size;
				print $OUTPUT_FILE "\t";
				print $OUTPUT_FILE $density;

				}#for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++)

			close ($OUTPUT_FILE);
			}
		}#for my $mutation_key ( keys %genome_mutation_data_windows)

	#calculate the mean mutation density for the windows in the genome
	$genome_mean_mutation_density = $genome_mean_mutation_density/$total_genome_windows;	

	#find the sum of squared difference between the density of mutation of each window and the mean density of mutation
	for my $mutation_key ( keys %genome_mutation_data_windows){
		if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){
			my $density;

			for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){
				if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){
					$density = 0;
					}
				else{
					$density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size);
					}

				#sum the squared differences
				$genome_mutation_density_standard_deviation += ($density-$genome_mean_mutation_density)**2;
				}
			}
		}#for my $mutation_key ( keys %genome_mutation_data_windows)

	#divided the sum of the squared differnces by the total number of windows and take the square root
	$genome_mutation_density_standard_deviation = ($genome_mutation_density_standard_deviation/$total_genome_windows)**0.5;

	#calculate z scores for each window and check if the window is greater than 2 SDs away from genome mean
	#use this value to identify highly mutated regions
	for my $mutation_key ( keys %genome_mutation_data_windows){
		if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){
			my $density;
			my $region_z_score = 0;

			for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){
				
				if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){
					$density = 0;
					}
				else{
					$density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size);
					}

				#calculate z score for the window	
				$region_z_score = ($density-$genome_mean_mutation_density)/$genome_mutation_density_standard_deviation;

				#check if the z score is above the threshold
				if( $region_z_score >= $outlier_deviation ) {
					if($in_suspect_region!=1){
						$suspect_start = $chr_pos*$bin_size;
						$in_suspect_region = 1;
						}
					$suspect_chr = $mutation_key;
					$suspect_end = ($chr_pos+$window_size)*$bin_size;
					}
				elsif ($in_suspect_region==1){
					#once a region has been called push the chromosome, start and end positions into the suspect region array
					push (@suspect_regions, ($suspect_chr,$suspect_start,$suspect_end));
					$suspect_chr = -1;
					$suspect_start = -1;
					$suspect_end = -1;
					$in_suspect_region = 0;
					}

				#check if the z score is below the threshold but still suspicously high
				if(
				   ( $region_z_score < $outlier_deviation ) &&
				   ( $region_z_score >= ($outlier_deviation-1) )
				   ){
					if($in_likely_region!=1){
						$likely_start = $chr_pos*$bin_size;
						$in_likely_region = 1;
						}
					$likely_chr = $mutation_key;
					$likely_end = ($chr_pos+$window_size)*$bin_size;
					}
				elsif ($in_likely_region==1){
					push (@likely_regions, ($likely_chr,$likely_start,$likely_end));
					$likely_chr = -1;
					$likely_start = -1;
					$likely_end = -1;
					$in_likely_region = 0;
					}

				}#for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++)

				#check if the previously analyzed region was suspicious and push its data into the appropriate array
				if ($in_suspect_region==1){
					push (@suspect_regions, ($suspect_chr,$suspect_start,$suspect_end));
					$suspect_chr = -1;
					$suspect_start = -1;
					$suspect_end = -1;
					$in_suspect_region = 0;
					}

				if ($in_likely_region==1){
					push (@likely_regions, ($likely_chr,$likely_start,$likely_end));
					$likely_chr = -1;
					$likely_start = -1;
					$likely_end = -1;
					$in_likely_region = 0;
					}
			}
		}#for my $mutation_key ( keys %genome_mutation_data_windows)


	return (\@suspect_regions, \@likely_regions, \%genome_cnv_data_windows, \%genome_trans_data_windows, \%genome_mutation_data_windows);

	}#sub calculate_chromosome_localization

### check_copy_number_count #######################################################################
# Description:
#	Produces an output file that records the number of regions of copy-number variation that
#	are present in each chromosome.	
#
# Input variables:
#	$output_directory:			stores the path to the output directory	
#	$chromosome_copy_number_count_hash_ref:	reference to hash that stores the count of regions
#						of copy-number variation on each chromosome
#
sub check_copy_number_count {

	#parse parameters
	my $output_directory = shift;
	my $chromosome_copy_number_count_hash_ref = shift;

	my $OUTPUT_FILE;	#file handle to output file

	#open the output file
	open ($OUTPUT_FILE, ">", "$output_directory/copy_number_count.log") or die "ERROR: could not create file $output_directory/copy_number_count.log\n";

	#print the header
	print $OUTPUT_FILE "#chr\tcopy_number\tnumber_of_regions";

	#for each chromosome
	#print out the number of regions with the given copy-number
	for my $chr (sort keys %$chromosome_copy_number_count_hash_ref){
		my %intermediate_hash = %{$chromosome_copy_number_count_hash_ref->{$chr}};
		for my $CN (sort {$intermediate_hash{$b} <=> $intermediate_hash{$a} } keys %intermediate_hash){
			print $OUTPUT_FILE "\n";
			print $OUTPUT_FILE "$chr";	#chromosome
			print $OUTPUT_FILE "\t";
			print $OUTPUT_FILE "$CN";	#copy-number
			print $OUTPUT_FILE "\t";
			print $OUTPUT_FILE $chromosome_copy_number_count_hash_ref->{$chr}->{$CN};	#number of regions with copy-number $CN
			}
		}

	close ($OUTPUT_FILE);
		
	}#sub check_copy_number_count

### check_copy_number_switches ####################################################################
# Description:
#	Creates an output file that records the number of breakpoints between CNV regions on each
#	chromosome	
#
# Input variables:
#	$output_directory:			stores path to output directory	
#	$chromosome_copy_number_count_hash_ref:	reference to hash that stores the count of regions
#						of copy-number variation on each chromosome
#
sub check_copy_number_switches {

	#parse parameters
	my $output_directory = shift;
	my $chromosome_copy_number_count_hash_ref = shift;

	my $switch_count = 0;

	my $OUTPUT_FILE;	#file handle to output file

	#open output file
	open ($OUTPUT_FILE, ">", "$output_directory/copy_number_switches.log") or die "ERROR: could not create file $output_directory/copy_number_switches.log\n";

	#print header
	print $OUTPUT_FILE "#chr\tswitch_count";

	#for each chromosome
	#sum the total number of CNV events
	#multiply the sum by 2 to get the number of CNV breakpoints
	for my $chr (sort keys %$chromosome_copy_number_count_hash_ref){
		my %intermediate_hash = %{$chromosome_copy_number_count_hash_ref->{$chr}};
		for my $CN (sort {$intermediate_hash{$b} <=> $intermediate_hash{$a} } keys %intermediate_hash){
			#only count regions that have abberant copy numbers
			if($CN != 2){
				$switch_count += ($chromosome_copy_number_count_hash_ref->{$chr}->{$CN} * 2);
				}
			}

		#print values to output file
		print $OUTPUT_FILE "\n";
		print $OUTPUT_FILE "$chr";
		print $OUTPUT_FILE "\t";
		print $OUTPUT_FILE $switch_count;

		$switch_count = 0;
		}

	close ($OUTPUT_FILE);
		
	}#sub check_copy_number_switches

### calculate_interchromosomal_translocation_rate #################################################
# Description:
#	Create an output file that records the number of translocations between each and every
#	chromosome	
#
# Input variables:
#	$output_directory:				stores path to output directory	
#	$chromosome_translocation_count_hash_ref:	reference to hash that stores the count of 
#							translocations between each chromosome
#
sub calculate_interchromosomal_translocation_rate {

	#parse parameters
	my $output_directory = shift;
	my $chromosome_translocation_count_hash_ref = shift;

	my $OUTPUT_FILE;	#file handle to output file

	#open output file
	open ($OUTPUT_FILE, ">", "$output_directory/interchromosomal_translocation_rate.log") or die "ERROR: could not create file $output_directory/interchromosomal_translocation_rate.log\n";

	#print header
	print $OUTPUT_FILE "#chr1\tchr2\tcount";

	#for each chromosome
	#print the number of translocations between every other chromosome
	for my $chr1 (sort keys %$chromosome_translocation_count_hash_ref){
		my %intermediate_hash = %{$chromosome_translocation_count_hash_ref->{$chr1}};

		for my $chr2 (sort {$intermediate_hash{$b} <=> $intermediate_hash{$a} } keys %intermediate_hash){
			print $OUTPUT_FILE "\n";
			print $OUTPUT_FILE $chr1;
			print $OUTPUT_FILE "\t";
			print $OUTPUT_FILE $chr2;
			print $OUTPUT_FILE "\t";
			print $OUTPUT_FILE $chromosome_translocation_count_hash_ref->{$chr1}->{$chr2};
			}
		}

	close ($OUTPUT_FILE);
		
	}#sub calculate_interchromosomal_translocation_rate


### analyze_suspect_regions #######################################################################
# Description:
#	Produces the final report output file, that includes the chromothriptic scores for each of
#	the highly mutated regions	
#
# Input variables:
#	$output_directory:				stores path to the output directory
#	$suspect_regions_array_ref:			reference to array storing the chromosome,
#							start, and end position of highly mutated
#							regions
#	$genome_mutation_density_hash_ref:		stores the average mutation density of each
#							chromosome	
#	$genome_cnv_data_hash_ref:			stores the position of CNV mutations on
#							each chromosome
#	$genome_trans_data_hash_ref:			stores the position of translocation events
#							on each chromosome
#	$genome_trans_insertion_breakpoints_hash_ref:	stores the position of insertions on each
#							chromosome
#	$bin_size:					stores the size of a single bin
#	$localization_window_size:			stores the number of bins to include in a
#							window
#	$tp53_mutated:					stores whether the TP53 gene is mutatated
#							or not
#	$chromosome_cnv_breakpoints_hash_ref:		stores the breakpoints of CNV mutations on
#							each chromosome
#	$chromosome_loh_breakpoints_hash_ref:		stores the breakpoints of LOH regions on
#							each chromosome
#
sub analyze_suspect_regions {

	#parse parameters
	my $output_directory 				= shift;

	my $suspect_regions_array_ref 			= shift;
	my @suspect_regions 				= @$suspect_regions_array_ref;

	my $genome_mutation_density_hash_ref 		= shift;
	my %genome_mutation_density_hash		= %{$genome_mutation_density_hash_ref};
	
	my $genome_cnv_data_hash_ref 				= shift;
	my $genome_trans_data_hash_ref				= shift;
	my $genome_trans_insertion_breakpoints_hash_ref 	= shift;

	my $bin_size 					= shift;
	my $localization_window_size			= shift;

	my $tp53_mutated				= shift;

	my $chromosome_cnv_breakpoints_hash_ref		= shift;
	my $chromosome_loh_breakpoints_hash_ref		= shift;


	my $suspect_regions_size = @suspect_regions;	

	my $OUTPUT_FILE;

	my @suspect_region_data = ();

	my $header_string;

	#check that the suspect region data array is not malformed, should contain sets of 3 elements
	if($suspect_regions_size % 3 != 0){
		die "ERROR: suspect_regions_array has $suspect_regions_size entries. Value must be divisible by 3.\n";
		}

	#create output directory for report files	
	mkdir ("$output_directory"."suspect_regions",0770) unless (-d "$output_directory"."suspect_regions");
	if(!(-e "$output_directory"."suspect_regions")){
		die "ERROR: could not create folder $output_directory"."suspect_regions";
		}

	#open final report output file
	open ($OUTPUT_FILE, ">", "$output_directory"."suspect_regions/suspect_regions.yml") or die "ERROR: could not create file: $output_directory"."suspect_regions/suspect_regions.yml\n";

	#construct and print header
	$header_string = "file: Suspect Chromothriptic Regions\n";
	$header_string .= "bin_size:\t\t$bin_size\n";
	$header_string .= "localization_window_size:\t$localization_window_size\n";
	$header_string .= "\n";
	$header_string .= "cnv_score_weight:\t\t\t$cnv_weight\n";
	$header_string .= "mutation_density_score_weight:\t\t$mutation_density_weight\n";
	$header_string .= "genome_localization_score_weight:\t$genome_localization_weight\n";
	$header_string .= "translocation_score_weight:\t\t$translocation_weight\n";
	$header_string .= "insertion_breakpoint_score_weight:\t\t$insertion_breakpoint_weight\n";
	$header_string .= "loh_score_weight:\t\t\t$loh_weight\n";
	$header_string .= "tp53_mutation_score_weight:\t\t$tp53_mutated_weight\n";
	$header_string .= "\n";
	$header_string .= "min_mutation_density_z_score:\t$outlier_deviation\n";
	$header_string .= "---\n";
	$header_string .= "\n";
	print $OUTPUT_FILE $header_string;

	#calculate a chromothripsis score for each region that was present in the suspect region array
	#store the results of the score calculation in a 2d array where elements in the first dimension correspond to each suspect region
	#and the elements in the second dimension are the results of the score calculation
	for (my $i = 0; $i < $suspect_regions_size; $i+=3){
		my @region_data = ();
		$region_data[0]   = $suspect_regions[$i];	#chr
		$region_data[1] = $suspect_regions[$i+1];	#start
		$region_data[2]   = $suspect_regions[$i+2]; 	#end

		($region_data[3], $region_data[4], $region_data[5], $region_data[6], $region_data[7], $region_data[8], $region_data[9], $region_data[10], $region_data[11], $region_data[12], $region_data[13]) = calculate_score($region_data[0], $region_data[1], $region_data[2], $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $tp53_mutated, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref, $bin_size);

		#add the results of the score calculation for this region to the array storing all the results
		push @suspect_region_data, [@region_data];
		}

	#sort the results so that the region with the highest chromothriptic score will be printed
	#to the final report output file first
	@suspect_region_data = sort {$b->[3] <=> $a->[3] } @suspect_region_data;

	#for each score that is generated print the score and the related statistics for that region 
	foreach my $score_data (@suspect_region_data){
		my $chr   = $score_data->[0];	#chr
		my $start = $score_data->[1];	#start
		my $end   = $score_data->[2]; 	#end
		
		my $score			= sprintf("%.5f",$score_data->[3]);
		my $chr_z_score			= $score_data->[4];
		my $region_density		= sprintf("%e",$score_data->[5]);

		my $cnv_number_hash_ref		= $score_data->[6];
		my %cnv_number_hash;
		my $num_copy_num;
		if(defined($cnv_number_hash_ref)){
			%cnv_number_hash = %{$cnv_number_hash_ref};
			$num_copy_num = keys %cnv_number_hash;
			}
		else{
			$num_copy_num = 0;
			}

		my $cnv_density			= sprintf("%e",$score_data->[7]);

		my $intertranslocation_hash_ref	= $score_data->[8];
		my $translocation_density	= $score_data->[9];
		my %intertranslocation_hash;
		my $num_trans_chr;
		if(defined($intertranslocation_hash_ref)){
			%intertranslocation_hash = %{$intertranslocation_hash_ref};
			$num_trans_chr = keys %intertranslocation_hash;
			}
		else{
			$num_trans_chr = 0;
			}

		my $breakpoint_insertions_array_ref	= $score_data->[10];
		my @breakpoint_insertions_array;
		my $breakpoint_percentage;
		if(defined($breakpoint_insertions_array_ref)){
			@breakpoint_insertions_array = @$breakpoint_insertions_array_ref;
			$breakpoint_percentage	= sprintf("%.2f",$breakpoint_insertions_array[2]*100);
			}

		my $loh_size = $score_data->[11];
		my $hz_size = $score_data->[12];
		my $percent_hz_lost;
		if(defined($loh_size) && defined($hz_size)){
			$percent_hz_lost = sprintf("%.2f",($loh_size/$hz_size)*100);
			}

		my @score_array = @{$score_data->[13]};

		my $chr_density = $genome_mutation_density_hash{$chr};

		my $print_string;

		$print_string = "chromosome:\t$chr\n";
		$print_string .= "start:\t\t$start\n";
		$print_string .= "end:\t\t$end\n";
		$print_string .= "\n";

		$print_string .= "final_score:\t\t\t$score\n";
		$print_string .= "cnv_score:\t\t\t".$score_array[0]*$cnv_weight."\n";
		$print_string .= "mutation_density_score:\t\t".$score_array[1]*$mutation_density_weight."\n";
		$print_string .= "genome_localization_score:\t".$score_array[2]*$genome_localization_weight."\n";
		$print_string .= "translocation_score:\t\t".$score_array[3]*$translocation_weight."\n";
		$print_string .= "insertion_breakpoint_score:\t".$score_array[4]*$insertion_breakpoint_weight."\n";
		$print_string .= "loh_score:\t\t\t".$score_array[5]*$loh_weight."\n";
		
		$print_string .= "\n";

		$print_string .= "mutation_density_of_region:\t$region_density\n";
		$print_string .= "mutation_density_of_chromosome:\t$chr_density\n";
		$print_string .= "standard_deviations_from_mean_of_chromosome_mutation_density:\t$chr_z_score\n";
		$print_string .= "\n";

		$print_string .= "density_of_copy_number_switches: $cnv_density\n";
		$print_string .= "number_of_aberrant_copy_number_states:\t$num_copy_num\n";
		if($num_copy_num>0){
			$print_string .= "aberrant_copy_number_states:\n";
			foreach my $key (sort {$cnv_number_hash{$b} <=> $cnv_number_hash{$a} } keys %cnv_number_hash){
				$print_string .= "\t$key:\t$cnv_number_hash{$key}\n";
				}
			}

		$print_string .= "\n";

		$print_string .= "density_of_translocation_breakpoints: $translocation_density\n";
		$print_string .= "number_of_intertranslocational_chromosomes:\t$num_trans_chr\n";

		if($num_trans_chr>0){
			$print_string .= "intertranslocational_chromosomes:\n";
			foreach my $key (sort {$intertranslocation_hash{$b} <=> $intertranslocation_hash{$a} } keys %intertranslocation_hash){
				$print_string .= "\t$key:\t$intertranslocation_hash{$key}\n";
				}
			}
		$print_string .= "\n";

		if(defined($breakpoint_insertions_array_ref)){
			$print_string .= "insertion_data:\n";	
			$print_string .= "\tinsertions_found_at_translocation_breakpoints:\t$breakpoint_insertions_array[0]\n";	
			$print_string .= "\ttotal_translocation_breakpoints:\t$breakpoint_insertions_array[1]\n";	
			$print_string .= "\tpercentage:\t$breakpoint_percentage"."%\n";	
			$print_string .= "\n";
			}

		if($loh_size!=-1){
			$print_string .= "loh_data:\n";
			$print_string .= "\ttotal_size_of_loh:\t$loh_size\n";	
			$print_string .= "\ttotal_size_of_original_heterozygosity:\t$hz_size\n";
			$print_string .= "\tpercent_heterozygosity_lost:\t$percent_hz_lost"."%\n";	
			$print_string .= "\n";
			}

		$print_string .= "tp53_mutation_present:\t$tp53_mutated\n";

		$print_string .= "---\n";
		$print_string .= "\n";
		print $OUTPUT_FILE $print_string;
		$print_string = "";
		}

	print $OUTPUT_FILE "...";
	close($OUTPUT_FILE);
	}#sub analyze_suspect_regions

### analyze_likely_regions ########################################################################
# Description:
#	Generates an output file that lists the regions that have a mutation density that is less
#	than the outlier cut off but greater than 1 - the outlier cut off
#
# Input variables:
#	$output_directory:			stores path to the output directory
#	$likely_regions_array_ref:		reference to array storing the chromosome, start,
#						and end position of highly mutated regions
#	$genome_mutation_density_hash_ref:	stores the average mutation density of each
#						chromosome
#	$genome_cnv_data_hash_ref:		stores the position of CNV mutations on each
#						chromosome
#	$genome_trans_data_hash_ref:		stores the position of translocation events on each
#						chromosome
#	$bin_size:				stores the size of a single bin
#
sub analyze_likely_regions {

	#parse parameters
	my $output_directory 			= shift;

	my $likely_regions_array_ref 		= shift;
	my @likely_regions 			= @$likely_regions_array_ref;

	my $genome_mutation_density_hash_ref 	= shift;
	my $genome_cnv_data_hash_ref 		= shift;
	my $genome_trans_data_hash_ref		= shift;
	my $bin_size 				= shift;

	my $likely_regions_size = @likely_regions;

	my $OUTPUT_FILE;

	my @likely_region_data = ();	#stores start, end, chromsome and mutation density for each region

	#check that the likely region array is not malformed, should contain sets of 3 elements
	if($likely_regions_size % 3 != 0){
		die "ERROR: suspect_regions_array has $likely_regions_size entries. Value must be divisible by 3.\n";
		}


	#create output directory	
	mkdir ("$output_directory"."suspect_regions",0770) unless (-d "$output_directory"."suspect_regions");
	if(!(-e "$output_directory"."suspect_regions")){
		die "ERROR: could not create folder $output_directory"."suspect_regions";
		}

	#create output file
	open ($OUTPUT_FILE, ">", "$output_directory"."suspect_regions/likely_regions.log") or die "ERROR: could not create file: $output_directory"."suspect_regions/likely_regions.log\n";

	#print file header
	print $OUTPUT_FILE "Likely Chromothriptic Regions\n";
	print $OUTPUT_FILE "High Mutation Density Z-Score:\t$outlier_deviation\n";
	print $OUTPUT_FILE "Min Mutation Density Z-Score:\t$outlier_deviation-1\n";
	print $OUTPUT_FILE "---------------------------------------\n";
	print $OUTPUT_FILE "#chr\tstart\tend\tmutation_density";

	#for each likely region calculate the mutation density for the region and store it in the likely_region_data array
	for (my $i = 0; $i < $likely_regions_size; $i+=3){
		my @region_data = ();
		$region_data[0]   = $likely_regions[$i];	#chr
		$region_data[1]   = $likely_regions[$i+1];	#start
		$region_data[2]   = $likely_regions[$i+2]; 	#end

		($region_data[3],$region_data[4]) = calculate_region_mutation_density_score($region_data[0], $region_data[1], $region_data[2], $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $bin_size);

		push @likely_region_data, [@region_data];
		}

	#sort the regions by density, largest to smallest
	@likely_region_data = sort {$b->[3] <=> $a->[3] } @likely_region_data;

	#print the density for each region to the output file 
	foreach my $i (@likely_region_data){
		my $chr   = $i->[0];	#chr
		my $start = $i->[1];	#start
		my $end   = $i->[2]; 	#end
		
		my $region_density	= $i->[3];	#mutation density

		print $OUTPUT_FILE "\n";
		print $OUTPUT_FILE "$chr\t$start\t$end\t$region_density";
		}

	close($OUTPUT_FILE);
	}#sub analyze_likely_regions

### calculate_score ###############################################################################
# Description:
#	Calculates the chromothripic score for the given region. Calls sub methods to generate the
#	score for each hallmark	
#
# Input variables:
#	$chr:						stores the chromosome on which the region
#							is found	
#	$start:						stores the start base pair of the region
#	$end:						stores the end base pair of the region
#	$genome_cnv_data_hash_ref:			stores the position of CNV mutations on
#							each chromosome
#	$genome_trans_data_hash_ref:			stores the position of translocation events
#							on each chromosome
#	$genome_mutation_density_hash_ref:		stores the average mutation density of each
#							chromosome
#	$genome_trans_insertion_breakpoints_hash_ref:	stores the position of insertions on each
#							chromosome	
#	$tp53_mutated:					stores whether the TP53 gene is mutatated
#							or not
#	$chromosome_cnv_breakpoints_hash_ref:		stores the breakpoints of CNV mutations on
#							each chromosome
#	$chromosome_loh_breakpoints_hash_ref:		stores the breakpoints of LOH regions on
#							each chromosome
#	$bin_size:					stores the size of a single bin
#
sub calculate_score{

	#parse parameters
	my $chr 	= shift;
	my $start 	= shift;
	my $end		= shift;

	my $genome_cnv_data_hash_ref			= shift;
	my $genome_trans_data_hash_ref			= shift;
	my $genome_mutation_density_hash_ref		= shift;
	my $genome_trans_insertion_breakpoints_hash_ref	= shift;
	my $tp53_mutated				= shift;
	my $chromosome_cnv_breakpoints_hash_ref		= shift;
	my $chromosome_loh_breakpoints_hash_ref		= shift;
	my $bin_size					= shift;

	#initialize variable to store scores for each hallmark
	my $cnv_score				= 0;
	my $mutation_density_score		= 0;
	my $genome_localization_score		= 0;
	my $translocation_score			= 0;
	my $insertion_breakpoint_score		= 0;
	my $loh_score				= 0;
	my $final_score				= 0;

	my @score_array;	#array in which hallmark scores will be returned

	my $chr_mutation_density;	#stores the average mutation density of the chromosome where the region is found
	my $chr_z_score;		#stores the z_score of the mutation density of the chromosome where the region is found vs
					#all the other chromosomes
	my $cnv_number_hash_ref;	#stores a hash that contains the number of regions of each abberant copy-number
	my $cnv_density;		#stores the density of cnv mutations in the region
	my $translocation_density;	#stores the density of translocation mutations in the region
	my $mutation_density;		#stores the density of all mutations in the region
	my $intertranslocation_hash_ref;	#stores the number of translocations between all other chromosomes and the region
	my $breakpoint_insertions_array_ref;	#stores the total number of translocation breakpoints, and the number that have insertions nearby
	my $loh_size = -1;			#stores the amount of heterozygosity that was lost in the region
	my $heterozygous_size;			#stores the original amount of heterozygosity in the region
	
	($cnv_score, $cnv_number_hash_ref, $cnv_density) = calculate_copy_number_scores($chr, $start, $end, $genome_cnv_data_hash_ref, $bin_size);
	($genome_localization_score, $chr_z_score, $chr_mutation_density) = calculate_genome_localization_score($chr, $genome_mutation_density_hash_ref);
	($translocation_score, $intertranslocation_hash_ref, $translocation_density) = calculate_translocation_score($chr, $start, $end, $genome_trans_data_hash_ref, $bin_size);

	if(defined($genome_trans_insertion_breakpoints_hash_ref)){
		($insertion_breakpoint_score, $breakpoint_insertions_array_ref)	= calculate_insertion_breakpoint_score($chr, $start, $end, $genome_trans_data_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $bin_size);
		}

	($mutation_density, $mutation_density_score) = calculate_region_mutation_density_score($chr, $start, $end, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $bin_size);

	if(defined($chromosome_loh_breakpoints_hash_ref)){
		($loh_score, $loh_size, $heterozygous_size) = calculate_loh_score($chr, $start, $end, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref);
		}

	#calculate overall score for region based on hallmark weights and scores
	$final_score = ($cnv_score*$cnv_weight) + ($mutation_density_score*$mutation_density_weight) + ($genome_localization_score*$genome_localization_weight) + ($translocation_score*$translocation_weight) + ($insertion_breakpoint_score*$insertion_breakpoint_weight) + ($tp53_mutated*$tp53_mutated_weight) + ($loh_score*$loh_weight);

	#push the hallmark scores into the score array
	push (@score_array, ($cnv_score, $mutation_density_score, $genome_localization_score, $translocation_score, $insertion_breakpoint_score, $loh_score));

	#return the scores and other region statistics
	return ($final_score, $chr_z_score, $mutation_density, $cnv_number_hash_ref, $cnv_density, $intertranslocation_hash_ref , $translocation_density, $breakpoint_insertions_array_ref, $loh_size, $heterozygous_size, \@score_array);
	}#sub calculate_score


### calculate_copy_number_score  ##################################################################
# Description:
#	Calculates the score for the copy-number variation hallmark
#
# Input variables:
#	$chr:				stores the chromsome where the region is located
#	$start:				stores the starting location of the region
#	$end:				stores the end location of the region
#	$genome_cnv_data_hash_ref:	stores the position of CNV mutations on each chromosome
#	$bin_size:			stores the size of single bin
#
sub calculate_copy_number_scores {

	#parse parameters
	my $chr = shift;
	my $start = shift;
	my $end = shift;

	my $genome_cnv_data_hash_ref = shift;
	my %genome_cnv_data_hash = %$genome_cnv_data_hash_ref;

	my $bin_size = shift;

	#calculate array index where the data for the first and last bins of the region are located
	my $start_index = $start / ($bin_size);
	my $end_index = $end / ($bin_size);

	my $cnv_score = 0;	#stores final score to return

	my %cnv_number_hash;	#hash
				#key: copy number eg 0,1,3,4
				#value: the number of regions with the given copy number

	my $cnv_switch_count = 0;	#number of switches between different copy numbers
	my $cnv_switch_density = 0;	#density of cnv events in the region

	my @chr_data;	#stores all the bins for the chromosome where the region is located
	my %cnv_hash;	#stores the cnv hash from each bin

	my $mean = 0;	#stores the average number of regions of aberrant copy-number
	my $SD = 0;	#stores the standard deviation of the number of regions of aberrant copy-number

	my %cnv_significant;		#hash
					#key: copy number (but only significant ones are stored)
					#value: the number of regions with the given copy number

	my $significant_count = 0;	#stores the number of unique significant copy-numbers

	#check if there is cnv data for the chromosome
	if(defined($genome_cnv_data_hash{$chr})){

		#extract the bin data for the chromosome
		@chr_data = @{$genome_cnv_data_hash{$chr}};

		#collect the data from the bins that contain the region
		for (my $i = $start_index; $i < $end_index+1; $i++){
			if(!defined($chr_data[$i])){
				next;
				}
			%cnv_hash = %{$chr_data[$i]};

			for my $key (keys %cnv_hash){
				if($key eq 'BPcount'){
					$cnv_switch_count += $cnv_hash{$key};
					}
				else{
					$cnv_number_hash{$key}+= $cnv_hash{$key};
					}
				}
			}

		#calculate the breakpoint density of cnv mutations for the region 
		$cnv_switch_density = $cnv_switch_count/ ($end-$start);

		#calculate the number of cnv events in the region (half the the number of breakpoints)
		for my $key (keys %cnv_number_hash){
			$cnv_number_hash{$key} = POSIX::ceil($cnv_number_hash{$key});
			$mean += $cnv_number_hash{$key};
			}

		#if no cnv mutations were found return a score of 0
		if(scalar(keys %cnv_number_hash)==0){
			$cnv_score = 0;
			return ($cnv_score, \%cnv_number_hash, $cnv_switch_density);
			}

		#calculate the mean of the number regions of each copy-number
		$mean = $mean/(scalar(keys %cnv_number_hash));

		#calculate the standard deviation of the number of regions of each copy-number
		for my $key (keys %cnv_number_hash){
			$SD += ($cnv_number_hash{$key}-$mean)**2;
			}
		$SD = $SD/(scalar(keys %cnv_number_hash));
		$SD = $SD**0.5;

		#determine which copy-numbers are significant (ie are not low out liers)
		for my $key (keys %cnv_number_hash){
			if(
			   ( $SD==0 )||
			   ( (($cnv_number_hash{$key}-$mean)/$SD) >= -1*$outlier_deviation )
			  ){
				$cnv_significant{$key} = $cnv_number_hash{$key};
				$cnv_score = $cnv_score + $cnv_significant{$key}**2;	
				}
			}

		#score calculation
		$cnv_score = $cnv_score / (scalar(keys %cnv_significant));
		$cnv_score = log($cnv_score)/log(2);
		$cnv_score += 1;
		$cnv_score = 1 - (1/$cnv_score);
		$cnv_score = $cnv_score/(scalar(keys %cnv_significant));
		}

	return ($cnv_score, \%cnv_number_hash, $cnv_switch_density);

	}#sub calculate_copy_number_scores

### calculate_genome_localization_score  ##########################################################
# Description:
#	Calculates the genome localization hallmark score
#
# Input variables:
#	$chr:					store the chromosome where the region is located
#	$genome_mutation_density_hash_ref:	stores the average mutation density of each 
#						chromosome
#
sub calculate_genome_localization_score {

	#parse parameters
	my $chr = shift;

	my $genome_mutation_density_hash_ref = shift;
	my %genome_mutation_density_hash = %{$genome_mutation_density_hash_ref};

	#read mutation density for the chromosome
	my $chr_mutation_density = $genome_mutation_density_hash{$chr}; 

	my $mean_density = 0;	#stores the average density of mutations for the chromosomes

	my $standard_deviation = 0;	#stores the standard deviation of the mutation densities of the chromosomes

	my $z_score = 0;	#stores the z-score for the suspect chromosome
	my $p_val = 0;		#stores the p-value calculated from the z-score for the suspect chromosome

	#sum mutation densities of all the chromosomes
	for my $key (keys %genome_mutation_density_hash){
		$mean_density += $genome_mutation_density_hash{$key}/$chromosome_length{$chr};
		}

	#calculate the mean
	$mean_density = $mean_density / 24;

	#calculate the standard deviation	
	for my $key (keys %genome_mutation_density_hash){
		$standard_deviation += ((($genome_mutation_density_hash{$key}) - ($mean_density) )**2);
		}
	$standard_deviation = $standard_deviation / 24;
	$standard_deviation = ($standard_deviation)**0.5;

	#check for case where the standard deviation or mean comes back as 0
	if($mean_density == 0 || $standard_deviation == 0){
		return (0, 0, $chr_mutation_density);
		}

	#calculate the z-score for suspect chromosome
	$z_score = ($chr_mutation_density - $mean_density) / $standard_deviation;

	#calculate the p-value from the z-score
	$p_val = Statistics::Distributions::uprob($z_score);
	#only consider top tail
	$p_val = 0.5-$p_val;
	$p_val = $p_val/0.5;

	#check for case where z-score comes back as 0	
	if($z_score < 0){
		$p_val = 0;
		}

	#p_val is the score for this hallmark
	return ($p_val, $z_score, $chr_mutation_density);

	}#sub calculate_genome_localization_score

### calculate_region_mutation_density_score  ######################################################
# Description:
#	Calculates the chromosome localization hallmark score	
#
# Input variables:
#	$chr:					chromosome where the region is located
#	$start:					starting location of the region
#	$end:					end location of the region
#	$genome_cnv_data_hash_ref:		stores the position of CNV mutations on each 
#						chromosome	
#	$genome_trans_data_hash_ref:		stores the position of translocation events
#						on each chromosome	
#	$genome_mutation_density_hash_ref:	stores the average mutation density of each
#						chromosome
#	$bin_size:				stores the size of single bin
#
sub calculate_region_mutation_density_score {

	#parse parameters
	my $chr = shift;
	my $start = shift;
	my $end = shift;

	my $genome_cnv_data_hash_ref = shift;
	my %genome_cnv_data_hash = %{$genome_cnv_data_hash_ref};

	my $genome_trans_data_hash_ref = shift;
	my %genome_trans_data_hash = %{$genome_trans_data_hash_ref};

	my $genome_mutation_density_hash_ref = shift;
	my %genome_mutation_density_hash = %{$genome_mutation_density_hash_ref};

	my $bin_size = shift;

	#calculate array index where the data for the first and last bins of the region are located
	my $start_index = $start / $bin_size;
	my $end_index = $end / $bin_size;

	#get the mean mutation density for the suspect chromosome
	my $mean_chr_mutation_density = $genome_mutation_density_hash{$chr}; 

	my $chis_stat;	#stores the chi squared statistic value

	my $mutation_count = 0;		#stores the total mutation count in the region
	my $mutation_density = 0;	#store the mutation density of the region
	my $mutation_density_score;	#stores the final score for the hallmark

	my @chr_data;	#stores the bin data for the chromosome
	my %cnv_hash;	#stores the cnv hash for each bin
	my %trans_hash;	#stores the translocation hash for each bin

	#check that there is cnv data for the chromosome
	if(defined($genome_cnv_data_hash{$chr})){
		#get the cnv data for the chromosome
		@chr_data = @{$genome_cnv_data_hash{$chr}};

		#sum the number of cnv breakpoints in the suspect region
		for (my $i = $start_index; $i < $end_index+1; $i++){
			if(!defined($chr_data[$i])){
				next;
				}

			%cnv_hash = %{$chr_data[$i]};
			$mutation_count += $cnv_hash{'BPcount'};
			}
		}

	@chr_data = ();

	#check that there is translocation data for the chromosome
	if(defined($genome_trans_data_hash{$chr})){
		#get the translocation data for the chromosome
		@chr_data = @{$genome_trans_data_hash{$chr}};

		#sum the number of translocation breakpoints in the suspect region
		for (my $i = $start_index; $i < $end_index+1; $i++){
			if(!defined($chr_data[$i])){
				next;
				}
			%trans_hash = %{$chr_data[$i]};
			$mutation_count += $trans_hash{'BPcount'};
			}
		}

	#divide the count by 2 to get the number of events	
	$mutation_count = POSIX::ceil($mutation_count/2);

	#calculate the mutation density for the region
	$mutation_density = $mutation_count / ($end-$start);

	#calculate the chi squared statistic
	$chis_stat = (($mutation_density-$mean_chr_mutation_density)**2)/($mean_chr_mutation_density);

	#generate a p-value using the chi squared test
	$mutation_density_score = 1-(Statistics::Distributions::chisqrprob(1,$chis_stat));

	return ($mutation_density, $mutation_density_score);
	}#calculate_region_mutation_density_score

### calculate_translocation_score #################################################################
# Description:
#	Calculates the translocation hallmark score 
#
# Input variables:
#	$chr:				chromosome where the region is located
#	$start:				starting location of the region
#	$end:				end location of the region
#	$genome_trans_data_hash_ref:	stores the position of translocation events on each
#					chromosome
#	$bin_size:			stores the size of single bin
#
sub calculate_translocation_score {

	#parse parameters
	my $chr = shift;
	my $start = shift;
	my $end = shift;

	my $genome_trans_data_hash_ref = shift;
	my %genome_trans_data_hash = %{$genome_trans_data_hash_ref};

	my $bin_size = shift;

	#calculate array index where the data for the first and last bins of the region are located
	my $start_index = $start / ($bin_size);
	my $end_index = $end / ($bin_size);

	my $translocation_density = 0;	#stores the density of translocation events in the region

	my @chr_data;		#stores the bin data for the chromosome
	my %trans_breakpoints;	#hash
				#key: chromosome eg 1,2,X,Y
				#value: an array storing the position of the translocation breakpoints

	my %trans_breakpoint_spreads;	#stores the average distance between translocation breakpoints
	my @significant_chrs;		#stores a list of chromosomes that have a significant number of
					#translocation to or from the region
	my @diffs;			#stores the distance between adjacent translocation breakpoints on
					#one chromosome
	my $diff_sum;			#stores the sum of the distances
	my $diff_count;			#store the number regions between translocation breakpoints

	my %trans_number_hash;		#hash
					#key: chromosome eg 1,2,X,Y
					#value: the number of events between the chromosome and the region

	my $mean = 0;			#stores the average number of translocations between each chromosome
					#and the region
	my $SD = 0;			#stores the standard deviation of the above value

	my $weighted_sum = 0.00;	#component of the score calculation

	my $size = 0.00;		#intermediate variable used to collect sums
	my $count = 0;			#intermeidate variable

	my $translocation_score = 0;	#final hallmark score

	my $spread_factor = 0;

	#check that there is translocation data for the chromosome
	if(defined($genome_trans_data_hash{$chr})){
		#get the translocation data for the chromosome
		@chr_data = @{$genome_trans_data_hash{$chr}};

		#for each bin sum the number of translocation events
		#and record the position of breakpoints in the trans_breakpoints hash
		for (my $i = $start_index; $i < $end_index+1; $i++){
			if(!defined($chr_data[$i])){
				next;
				}
			my %trans_hash = %{$chr_data[$i]};
			my %trans_hash_in;
			my %trans_hash_out;

			#analyze the breakpoints from translocation into the region
			if(defined($trans_hash{'in'})){
				%trans_hash_in = %{$trans_hash{'in'}};

				for my $key (keys %trans_hash_in){
					$size = @{$trans_hash_in{$key}};

					#calculate the number of translocation events
					$size = $size/2;

					#add the count to the appropriate hash
					$trans_number_hash{$key} += $size;

					#add the breakpoints to the trans_breakpoints hash
					push(@{$trans_breakpoints{$key}},(@{$trans_hash_in{$key}}));
					}
				}

			#analyze the breakpoints from translocation out of the region
			if(defined($trans_hash{'out'})){
				%trans_hash_out = %{$trans_hash{'out'}};

				for my $key (keys %trans_hash_out){
					$size = @{$trans_hash_out{$key}};
					$count = $size;
					if($key eq $chr){
						foreach my $val (@{$trans_hash_out{$key}}){
							if($val > $start && $val < $end){
								$count--;
								}
							}
						}
					#calculate the number of translocation events
					$count = $count/2;

					#add the count to the appropriate hash
					$trans_number_hash{$key} += $count;

					#add the breakpoints to the trans_breakpoints hash
					push(@{$trans_breakpoints{$key}},(@{$trans_hash_out{$key}}));
					}
				}
			}

		$count = 0;

		#check that some translocation events were found else return a score of 0
		if (keys(%trans_number_hash) == 0){
			$translocation_score = 0;
			$translocation_density = 0;
			return ($translocation_score, \%trans_number_hash, $translocation_density);
			}

		for my $key (keys %trans_number_hash){
			#round the event count up since if only one breakpoint was present at the end
			#of the region we still count that as a whole translocation event
			$trans_number_hash{$key} = POSIX::ceil($trans_number_hash{$key});

			#sum the number of translocation events in the region
			$count += $trans_number_hash{$key};
			}

		#calculate the translocation density for the region
		$translocation_density = $count / ($end-$start);

		#calculate the mean and standard deviation of the number of translocation between the region
		#and each chromosome
		($SD, $mean) = standard_deviation_and_mean(\%trans_number_hash,0);
	
		#identify chromosomes that have a high number of translocations to or from the region and
		#add them to the significant chromosome list
		for my $key (keys %trans_number_hash){
			if(
			   ( $SD==0 )||
			   ( (($trans_number_hash{$key}-$mean)/$SD)>-2*$outlier_deviation)
			  ){
				push (@significant_chrs, $key);
				}
			}

		#for each significant chromosome calculate the spread between translocation events
		foreach my $key (@significant_chrs){
			#sort the breakpoints
			@{$trans_breakpoints{$key}} = sort {$a <=> $b} @{$trans_breakpoints{$key}};

			$size = @{$trans_breakpoints{$key}};
			@diffs = ();
			$diff_sum = 0;
			$diff_count = 0;

			#calculate and store the distance between adjacent breakpoints
			for (my $i = 1; $i<$size; $i++){
				push (@diffs, @{$trans_breakpoints{$key}}[$i]-@{$trans_breakpoints{$key}}[$i-1]);
				}

			#check that more that one distance was calculated
			if($size==1){
				$trans_breakpoint_spreads{$key} = 0;
				$diff_count = 1;
				}
			else{	#calculate the standard deviation and mean for the distance between breakpoints
				($SD,$mean) = standard_deviation_and_mean(\@diffs,1);

				#sum the distances that are not high outliers, indicating distance between 2 translocation
				#events and not distance between breakpoints of the same event
				foreach my $val (@diffs){
					if(
					   ( $SD==0 )||
					   ( (($val-$mean)/$SD)<$outlier_deviation )
					  ){
						$diff_sum += $val;
						$diff_count++;
						}
					}
				}

			#calculate the average spread of translocation breakpoints
			$trans_breakpoint_spreads{$key} = $diff_sum / $diff_count;

			#calculate the spread factor for the chromosome
			#my $spread_factor = (log($trans_breakpoint_spreads{$key}+1)/log(10))/((log($expected_mutation_density)/log(10))*-1);
			$spread_factor = (log($trans_breakpoint_spreads{$key}+1)/log(10))/((log($expected_mutation_density)/log(10))*-1);
			if($spread_factor==0){
				$spread_factor = 1;
				}

			#increase the weighted sum based on the number of translocation events and their spread
			$weighted_sum += $trans_number_hash{$key}/$spread_factor;	
			}#foreach my $key (@significant_chrs)

		#final hallmark score calculation
		$size = @significant_chrs;
		$translocation_score = (1/(1+(log($size)/log(4))))*(1-(1/(log(1+$weighted_sum)/log(2))));

		if($translocation_score>=1){
			print "score: ".$translocation_score."\n";
			print "ws: ".$weighted_sum."\n";
			print "sf: ".$spread_factor."\n";
			print "term 1: ";
			print (1/(1+(log($size)/log(4))));
			print "\n";
			print "term 2: ";
			print (1-(1/(log($weighted_sum)/log(2))));
			print "\n";
			}

		}#if(defined($genome_trans_data_hash{$chr}))

	return ($translocation_score, \%trans_number_hash, $translocation_density);

	}#sub calculate_translocation_score

### calculate_insertion_breakpoint_score ##########################################################
# Description:
#	Calculates the insertions at translocation breakpoints hallmark score	
#
# Input variables:
#	$chr:						chromosome where the region is located
#	$start:						starting location of the region
#	$end:						end location of the region
#	$genome_trans_data_hash_ref:			stores the position of translocation events 
#							on each chromosome
#	$genome_trans_insertion_breakpoints_hash_ref:	stores the position of breakpoints with
#							insertions nearby
#	$bin_size:					stores the size of single bin
#
sub calculate_insertion_breakpoint_score {

	#parse parameters
	my $chr = shift;
	my $start = shift;
	my $end = shift;

	my $genome_trans_data_hash_ref = shift;
	my %genome_trans_data_hash = %{$genome_trans_data_hash_ref};

	my $genome_trans_insertion_breakpoints_hash_ref = shift;
	my %genome_trans_insertion_breakpoints_hash = %{$genome_trans_insertion_breakpoints_hash_ref};

	my $bin_size = shift;

	#calculate array index where the data for the first and last bins of the region are located
	my $start_index = $start / ($bin_size);
	my $end_index = $end / ($bin_size);

	my $total_breakpoints = 0;	#total number of breakpoints in the region
	my $inserted_breakpoints = 0;	#total number of breakpoints with nearby insertions in the region

	my $insertion_breakpoint_score = 0;

	my @chr_data;		#stores the bin data for the chromosome
	my %trans_hash;		#stores translocation hash for each bin

	my @inserted_breakpoint_list;	#stores the breakpoints that have insertions nearby
	my @breakpoint_data;		#stores $total_breakpoints and $inserted_breakpoints for return

	#check if there is translocation data for the region
	if(defined($genome_trans_data_hash{$chr})){

		#get the translocation data
		@chr_data = @{$genome_trans_data_hash{$chr}};

		#for each bin in the region sum the number of breakpoints
		for (my $i = $start_index; $i < $end_index+1; $i++){
			if(!defined($chr_data[$i])){
				next;
				}
			%trans_hash = %{$chr_data[$i]};
			$total_breakpoints += $trans_hash{'BPcount'};
			}

		#get the list of breakpoints with insertions nearby on the chromosome
		@inserted_breakpoint_list = @{$genome_trans_insertion_breakpoints_hash{$chr}};

		#sort the above list
		@inserted_breakpoint_list = sort {$a <=> $b} @inserted_breakpoint_list;

		#calculate how many of the breakpoints with insertions are in the region
		foreach my $breakpoint (@inserted_breakpoint_list){
			if($breakpoint > $end){
				last;
				}
			if($breakpoint > $start){
				$inserted_breakpoints++;
				}
			}

		#calculate the hallmark score
		$insertion_breakpoint_score = $inserted_breakpoints/$total_breakpoints;

		if($insertion_breakpoint_score > 1){
			die "ERROR: found a insertion_breakpoint_score greater than 1\n";
			}

		push (@breakpoint_data, $inserted_breakpoints);
		push (@breakpoint_data, $total_breakpoints);
		}

	return ($insertion_breakpoint_score, \@breakpoint_data);
	}

### calculate_loh_score ###########################################################################
# Description:
#	Calculates the loss of heterozgozity hallmark score	
#
# Input variables:
#	$chr:					chromosome where the region is located
#	$start:					starting location of the region
#	$end:					end location of the region
#	$chromosome_cnv_breakpoints_hash_ref:	stores the breakpoints of CNV mutations on each
#						chromosome	
#	$chromosome_loh_breakpoints_hash_ref:	stores the breakpoints of LOH regions on each
#						chromosome
#
sub calculate_loh_score {

	#parse parameters
	my $chr		= shift;
	my $start	= shift;
	my $end		= shift;

	my $chromosome_cnv_breakpoints_hash_ref	= shift;
	my %chromosome_cnv_breakpoints_hash	= %{$chromosome_cnv_breakpoints_hash_ref};

	my $chromosome_loh_breakpoints_hash_ref	= shift;
	my %chromosome_loh_breakpoints_hash	= %{$chromosome_loh_breakpoints_hash_ref};

	my @cnv_breakpoints;		#stores cnv breakpoints in the region
	my $cnv_breakpoints_size;	#stores the number of cnv breakpoints in the region

	my @loh_breakpoints;		#stores the LOH breakpoints in the region
	my $loh_breakpoints_size;	#stores the number of LOH breakpoints in the region

	#calculate maximum potential amount of heterozygosity
	my $original_heterozygous_size = $end - $start;

	#calculate maximum pontentail amount of heterozygosity that can remain
	my $remaining_heterozygous_size 	= $end - $start;

	my $loh_size = 0;	#stores the size of all LOH regions in the region

	my $loh_score = 0;	#final hallmark score

	#check if there is any cnv data for the chromosome	
	if(
	   ( !defined($chromosome_cnv_breakpoints_hash{$chr}) ) ||
	   ( !defined($chromosome_loh_breakpoints_hash{$chr}) )
	  ){
		$loh_score = 0;
		$loh_size = -1;
		$remaining_heterozygous_size = -1;
		return($loh_score, $loh_size, $remaining_heterozygous_size);
		}

	#get the cnv breakpoints for the chromosome
	@cnv_breakpoints = @{$chromosome_cnv_breakpoints_hash{$chr}};

	#sort the list
	@cnv_breakpoints = sort {$a <=> $b} @cnv_breakpoints;

	#get the number of breakpoints
	$cnv_breakpoints_size = @cnv_breakpoints;

	#find all the cnv events that occur in the region and subtract the size of these regions
	#from the $original_heterozygous_size value
	for (my $i = 0; $i< $cnv_breakpoints_size; $i+=2){
		my $cnv_start = $i;
		my $cnv_end = $i+1;

		my $end_overlap = 0;
		my $start_overlap = 0;

		if($cnv_breakpoints[$cnv_start] > $end){
			last;
			}

		#Check if the end point of the cnv region is within the suspect region
		if($cnv_breakpoints[$cnv_end] >= $start && $cnv_breakpoints[$cnv_end] <= $end){
			$end_overlap = 1;
			}

		#Check if the start point of the region cnv is within the suspect region
		if($cnv_breakpoints[$cnv_start] >= $start && $cnv_breakpoints[$cnv_start] <= $end){
			$start_overlap = 1;
			}

		#If an overlap was detected
		if($start_overlap==1 && $end_overlap==1) {
			$remaining_heterozygous_size -= ($cnv_breakpoints[$cnv_end] - $cnv_breakpoints[$cnv_start]);
			}
		elsif($start_overlap==1){
			$remaining_heterozygous_size -= ($end-$cnv_breakpoints[$cnv_start]);
			}
		elsif($end_overlap==1){
			$remaining_heterozygous_size -= ($cnv_breakpoints[$cnv_end]-$start);
			}
		elsif($cnv_breakpoints[$cnv_start] < $start && $cnv_breakpoints[$cnv_end] > $end){
			$remaining_heterozygous_size = 0;
			}
		}

	#check if there is no potential heterozygous regions in the suspect region or if there are no cnv events
	#in the suspect region, if either is the case return a score of 0
	if($remaining_heterozygous_size == 0 || $remaining_heterozygous_size == $original_heterozygous_size){
		$loh_score = 0;
		$loh_size = -1;
		$remaining_heterozygous_size = -1;
		return($loh_score, $loh_size, $remaining_heterozygous_size);
		}
	
	#get a list of the LOH breakpoints on the chromosome	
	@loh_breakpoints = @{$chromosome_loh_breakpoints_hash{$chr}};

	#sort the list
	@loh_breakpoints = sort {$a <=> $b} @loh_breakpoints;

	#get the number of breakpoints
	$loh_breakpoints_size = @loh_breakpoints;

	#determine which LOH regions are in the suspect region
	for (my $i = 0; $i< $loh_breakpoints_size; $i+=2){
		my $start_overlap_region_loh = 0;
		my $end_overlap_region_loh = 0;

		my $loh_start = $i;
		my $loh_end = $i+1;

		my $loh_start_breakpoint = $loh_breakpoints[$loh_start];
		my $loh_end_breakpoint = $loh_breakpoints[$loh_end];
		my $loh_region_size;

		if($loh_breakpoints[$loh_start] > $end){
			last;
			}

		#Check if the end point of the cnv region is within the loh region
		if($loh_breakpoints[$loh_end] >= $start && $loh_breakpoints[$loh_end] <= $end){
			$end_overlap_region_loh = 1;
			}

		#Check if the start point of region 1 is within region 2
		if($loh_breakpoints[$loh_start] >= $start && $loh_breakpoints[$loh_start] <= $end){
			$start_overlap_region_loh = 1;
			}

		#If an overlap was detected
		if($start_overlap_region_loh==1 && $end_overlap_region_loh!=1){
			$loh_end_breakpoint = $end;	
			}
		elsif($end_overlap_region_loh==1 && $start_overlap_region_loh!=1){
			$loh_start_breakpoint = $start;
			}
		elsif($loh_breakpoints[$loh_start] < $start && $loh_breakpoints[$loh_end] > $end){
			$loh_start_breakpoint = $start;
			$loh_end_breakpoint = $end;
			$start_overlap_region_loh=1;
			$end_overlap_region_loh=1;
			}
		if($start_overlap_region_loh != 1 && $end_overlap_region_loh != 1){	#if the loh region is not in the suspect region go to the next loh region
			next;
			}

		#if the loh region is in the suspect region then reduce the size of the loh by subtracting the size of cnv regions that over lap with it
		#this will tell us how much original heterozygosity remains
		$loh_region_size = $loh_end_breakpoint - $loh_start_breakpoint;

		#check for overlaps between the LOH region and cnv regions
		for (my $k = 0; $k< $cnv_breakpoints_size; $k+=2){   
			my $start_overlap_loh_cnv = 0;
			my $end_overlap_loh_cnv = 0;

			my $cnv_start = $k;
			my $cnv_end   = $k+1;

			if($cnv_breakpoints[$cnv_start] > $loh_end_breakpoint){
				last;
				}

			#Check if the end point of the cnv region is within the loh region
			if($cnv_breakpoints[$cnv_end] >= $loh_start_breakpoint && $cnv_breakpoints[$cnv_end] <= $loh_end_breakpoint){
				$end_overlap_loh_cnv = 1;
				}

			#Check if the start point of region 1 is within region 2
			if($cnv_breakpoints[$cnv_start] >= $loh_start_breakpoint && $cnv_breakpoints[$cnv_start] <= $loh_end_breakpoint){
				$start_overlap_loh_cnv = 1;
				}

			#If an overlap was detected
			if($start_overlap_loh_cnv==1 || $end_overlap_loh_cnv==1) {
				$loh_region_size -= ($cnv_breakpoints[$cnv_end] - $cnv_breakpoints[$cnv_start]);
				}
			elsif($start_overlap_loh_cnv==1){
				$loh_region_size -= ($loh_end_breakpoint-$cnv_breakpoints[$cnv_start]);
				}
			elsif($end_overlap_loh_cnv==1){
				$loh_region_size -= ($cnv_breakpoints[$cnv_end]-$loh_start_breakpoint);
				}
			elsif($cnv_breakpoints[$cnv_start] < $loh_start_breakpoint && $cnv_breakpoints[$cnv_end] > $loh_end_breakpoint){
				$loh_region_size = 0;
				}
			}
		$loh_size += $loh_region_size;		
		}


	#calculate the LOH score
	$loh_score = 1 - ($loh_size/$remaining_heterozygous_size);

	if($loh_size> $remaining_heterozygous_size){
		die "ERROR: invalid LOH size value found\n";
		}

	return($loh_score, $loh_size, $remaining_heterozygous_size);

	}#sub calculate_loh_score

### standard_deviation_and_mean ###################################################################
# Description:
#	Calculates the standard deviation and mean for a given set of values	
#
# Input variables:
#	$data_ref:	reference to either a hash or an array
#	$type:		0 indicates a hash, 1 indicates an array
#	
sub standard_deviation_and_mean{

	#parse parameters
	my $data_ref = shift;
	my $type = shift;

	my %hash;
	my @array;
	my $size;

	my $mean = 0;
	my $SD = 0;

	if($type==0){
		%hash = %{$data_ref};

		if((scalar(keys %hash))==0){
			die"Found sample size of 0 when calculating SD-hash\n";
			}

		#calculate mean
		for my $key (keys %hash){
			$mean += $hash{$key};
			}
	
		$mean = $mean/(scalar(keys %hash));

		#calculate sum of squared differences
		for my $key (keys %hash){
			$SD += ($hash{$key}-$mean)**2;
			}
		
		#calculate final standard deviation value
		$SD = $SD/(scalar(keys %hash));
		$SD = $SD**0.5;
	}
	elsif($type==1){
		@array = @{$data_ref};
		$size = @array;

		if($size==0){
			die"Found sample size of 0 when calculating SD-array\n";
			}
	
		#calculate mean
		foreach my $val (@array){
			$mean += $val;
			}

		$mean = $mean/$size;

		#calculate sum of squared differences
		foreach my $val (@array){
			$SD += ($val-$mean)**2;
			}

		#calculate final standard deviation value
		$SD = $SD/$size;
		$SD = $SD**0.5;
		
		}
	else{
		die"ERROR: invalid SD/mean type found\n";
		}
	return ($SD, $mean);

	}#sub standard_deviation_and_mean


### next_arg ######################################################################################
# Parse the next arguement from the command line 
#
sub next_arg {
	my $code = shift;
	$pos++;	
	if($pos == $ARGC){
		usage($code);
		}
	}#sub next_arg

### man_text ######################################################################################
# Print the manual help text
#
sub man_text {
	print "Main Usage:\n";
	print "\tperl -w shatterproof.pl --cnv <dir> --trans <dir> [--insrt <dir>] [--loh <dir>] [--tp53] --config <path> --output <dir>\n";
	print "\n";
	print "\tArguments:\n";
	print "\t\t--cnv\t\tDefine the path to the directory containing the CNV input files\n";
	print "\t\t--trans\t\tDefine the path to the directory containing the Translocation input files\n";
	print "\t\t--insrt\t\tDefine the path to the directory containing the insertion VCF input files\n";
	print "\t\t--loh\t\tDefine the path to the directory containing the LOH input files\n";
	print "\t\t--tp53\t\tIndicate that TP53 should be considered mutated, regardless of data\n";
	print "\t\t--config\tDefine the path to the ShatterProof config file\n";
	print "\t\t--output\tDefine the path to the directory where output should be placed\n";
	print "\t\tdir\t\tPath to a directory\n";
	print "\t\tpath\t\tPath to a file\n";
	print "\n";
	print "Help Usage:\n";
	print "\tperl -w shatterproof.pl --help\t\tThis help message.\n";
	print "\n";
	exit 0;
	}#sub man_text

### usage #########################################################################################
# Prints an error message when invalid command line arguements are found
#
sub usage {
	my $usage_msg = shift;

	print "u $usage_msg \n";
	
	switch($usage_msg){
		case 0 	{ print "ERROR: missing arguments\n"; 			}
		case 1 	{ print "ERROR: 2nd argument missing\n"; 		}
		case 2 	{ print "ERROR: CNV directory missing\n"; 		}
		case 3 	{ print "ERROR: --trans option missing\n"		}
		case 4 	{ print "ERROR: --cnv option missing\n"			}
		case 5 	{ print "ERROR: Translocation directory missing\n"	}
		case 6 	{ print "ERROR: --config option missing\n"		}
		case 7 	{ print "ERROR: --trans option missing\n"		}
		case 8 	{ print "ERROR: insertion directory missing\n"		}
		case 9 	{ print "ERROR: --config option missing\n"		}
		case 10 { print "ERROR: LOH directory missing \n"		}
		case 11 { print "ERROR: --config option missing\n"		}
		case 12 { print "ERROR: --config option missing\n"		}
		case 13 { print "ERROR: Path to config file missing\n"		}
		case 14 { print "ERROR: --output option missing\n"		}
		case 15 { print "ERROR: --config option missing\n"		}
		case 16 { print "ERROR: Output directory missing\n"		}
		case 17 { print "ERROR: --output option missing\n"		}
		case 18 { print "ERROR: too many arguments\n"			}
		}
	print "Try perl -w general_pipeline_tool.pl --help\n";
	exit 0;
	}#sub usage

### initialize_genome_hash ########################################################################
# Description:
#	Initializes a hash to store an array for each chromosome
#
sub initialize_genome_hash {

	my %genome_region_data = (  	#{chr}[region_num]->%region_data
			 X => [],
			 Y => [],
			 1 => [],
			 2 => [],
			 3 => [],
			 4 => [],
			 5 => [],
			 6 => [],
			 7 => [],
			 8 => [],
			 9 => [],
			 10 => [],
			 11 => [],
			 12 => [],
			 13 => [],
			 14 => [],
			 15 => [],
			 16 => [],
			 17 => [],
			 18 => [],
			 19 => [],
			 20 => [],
			 21 => [],
			 22 => []
			 );

	return (\%genome_region_data);
	}#sub initialize_genome_hash

### load_config_file #########################################################################################
# Description:
#	Opens the config file and reads the parameter values from it	
#
# Input variables:
#	$path:	path to config file
#
sub load_config_file {
            
        my $path = shift;
        print "\nLoading configuration file";

        #Load the configuration file config.pl
        my $CONFIG;
        open($CONFIG, "<","$path") or die "COULD NOT OPEN CONFIG FILE at path: $path \n";
        eval (<$CONFIG>) while (!eof($CONFIG));
        close($CONFIG);

        print " - Done\n";
        }#sub load_config_file

1;

=head1 NAME

ShatterProof - a script for analyzing next-generation sequencing data

=head1 DESCRIPTION

TODO FILL ME IN

=head1 README

TODO FILL ME IN
If there is any text in this section, it will be extracted into
a separate README file.

=head1 PREREQUISITES
strict;
warnings;
Carp;
Switch;
File::Basename;
List::Util qw[min max];
Statistics::Distributions;

=pod OSNAMES

any

=pod SCRIPT CATEGORIES

CPAN

=cut
