#!/usr/bin/perl
#
# This software has been created by Genome Research Limited (GRL).
# GRL hereby grants permission to use, copy, modify and distribute
# this software and its documentation for non-commercial purposes
# without fee at the user's own risk on the basis set out below. GRL
# neither undertakes nor accepts any duty whether contractual or
# otherwise in connection with the software, its use or the use
# of any derivative, and makes no representations or warranties,
# express or implied, concerning the software, its suitability,
# fitness for a particular purpose or non-infringement. In no event
# shall the authors of the software or GRL be responsible or liable
# for any loss or damage whatsoever arising in any way directly or
# indirectly out of the use of this software or its derivatives,
# even if advised of the possibility of such damage. Our software
# can be freely distributed under the conditions set out above,
# and must contain this copyright notice.
#

# penncnv_to_plink.pl : convert PennCNV output file to
# PLINK-format CNV list file.
#
# See http://pngu.mgh.harvard.edu/~purcell/plink/cnv.shtml
#
# $Id$
#
# Matthew Gillman, WTSI, 28th June 2010
#
# PennCNV files have the form:
# chr3:192548086-192552678     numsnp=8      length=4,593       state1,cn=0
# sample001.txt startsnp=rs6444540 endsnp=rs297396 conf=26.817
#
# PLINK CNV list file has:
# FID     Family ID         # Need both FID and IID to
# IID     Individual ID     # uniquely specify an individual.
# CHR     Chromosome
# BP1     Start position (base-pair)
# BP2     End position (base-pair)
# TYPE    Type of variant, e.g. 0,1 or 3,4 copies
# SCORE   Confidence score associated with variant
# SITES   Number of probes in the variant
#

use strict;
use warnings;
use Getopt::Long;

#
# Options for specifying families:
# 1. Do not specify; set each sample's family ID to sample ID.
# 2. Set all samples to the same (common) family ID, $comfam.
# 3. Supply a family file ($famfile) which gives each sample's family.
#
my ($infile, $outfile, $famfile, $comfam);

my %famdat = (); # If used: K = individual ID (sample ID); V = its family ID.

&process_command_line_arguments();

if ( $famfile ) {
	&read_famfile();
}

&create();



###################################################
#
# sub create()
#
# Read in the PennCNV file and create the PLINK file.
#
###################################################

sub create {

	open ( IN, $infile ) or die $!;
	open ( OUT, "> $outfile" ) or die $!;

	print OUT "FID\tIID\tCHR\tBP1\tBP2\tTYPE\tSCORE\tSITES\n";

	my ($fid, $iid, $chr, $bp1, $bp2, $type, $score, $sites);

	while ( my $line = <IN> ) {
		
		chomp($line);

		my ($posstr, $numprobestr, $lengthstr, $statestr, $samplestr, $startstr, $endstr, $confstr);
		my $unused;
		my @junk = ();

		# These lines assume the input file is correctly formatted.
		# Let's hope they are.
		if ( $line =~ /conf/ ) {
			($posstr, $numprobestr, $lengthstr, $statestr, $samplestr, $startstr,
			 $endstr, $confstr, @junk) = split /\s+/, $line, 9;
		}	
		else {
			($posstr, $numprobestr, $lengthstr, $statestr, $samplestr, $startstr,
			 $endstr, @junk) = split /\s+/, $line, 8;
		}

		#
		# $posstr : e.g. chr1:147305744-147427061
		#
		my @posdat = split /:/, $posstr;
		my $chrdat = $posdat[0]; # e.g. "chr1"
		my @chars = split (//, $chrdat);
		for ( my $s = 1; $s <= 3; $s++ ) {
			shift(@chars); 
		}

		$chr = join("", @chars);
			
		# if ( $chr eq "X" ) { $chr = 23; }
		# elsif ( $chr eq "Y" ) { $chr = 24; }
		# elsif ( $chr eq "XY" ) { $chr = 25; }
		# elsif ( $chr eq "MT" ) { $chr = 26; }
			
		my $bpdat = $posdat[1];
		($bp1, $bp2) = split /-/, $bpdat;
			
					
		#
		# $numprobestr : e.g. numsnp=8
		#
		($unused, $sites) = split /=/, $numprobestr;

		#
		# $statestr : e.g. state1,cn=0
		#
		($unused, $type) = split /=/, $statestr;
			

		#
		# $samplestr NB do we need to allow for / ?
		#
		$iid = $samplestr;
		if ( %famdat ) {
			if ( ! exists $famdat{$iid} ) { die "\nError: no famdat for $iid.\n"; }
			$fid = $famdat{$iid};
		}
		elsif ( $comfam ) {
			$fid = $comfam;
		}
		else {
			$fid = $iid;
		}

		#
		# $confstr : e.g. conf=26.817
		#
		if ( $confstr ) {
			($unused, $score) = split /=/, $confstr;
		}
		else {
			$score = 0;
		}


		print OUT "$fid\t$iid\t$chr\t$bp1\t$bp2\t$type\t$score\t$sites\n";

	}

	close OUT;
	close IN;

}



###################################################
#
# sub read_famfile()
#
# Read in the family information file.
#
# NB assumes each iid belongs to only one fid, i.e.
# iid unique across all samples. As PennCNV output
# doesn't have the concept of families, should be OK.
#
###################################################

sub read_famfile {
	# Read in family file and populate %famdat.
  # two items per line - family id, then iid
  # header is optional

  open ( IN, $famfile ) or die $!;

  while ( my $line = <IN> ) {

	next if ( ( $line =~ /FAM/i ) || ( $line =~ /FID/i ) );
	chomp($line);
	my ( $fid, $iid ) = split /\s+/, $line;
	if ( exists $famdat{$iid} ) {
	  warn "\nWARNING: already have family info for $iid; ignoring new.\n";
	}
	else {
	  $famdat{$iid} = $fid;
	}

  }

  close IN;

}



###################################################
#
# sub process_command_line_arguments()
#
###################################################

sub process_command_line_arguments {

  GetOptions (
			  'input=s'   => \$infile,   # -i. Input file (from PennCNV).
			  'output=s'  => \$outfile,  # -o. File to produce.	
			  'comfam=s'  => \$comfam,   # -c. Common family.
			  'famfile=s' => \$famfile,  # -f. Family info file.
			  'help'      => sub {&usage()},
			 );

  if ( ! $outfile ) {
	print "\nPlease specify output file.\n";
	&usage();
  }

  if ( ! $infile ) {
	print "\nPlease specify input file.\n";
	&usage();
  }

  if ( $comfam && $famfile ) {
	print "\n *** Please specify EITHER a common family OR a family file. ***\n";
	&usage();
  }

}



###################################################
#
# sub usage()
#
###################################################

sub usage {

  my $explain = <<END;

 penncnv_to_plink.pl : convert PennCNV CNV output file to PLINK-format CNV list file.

 Usage:
 perl penncnv_to_plink.pl -i <input> -o <output> [ -f <famfile> | -c <comfam> ] | -h

 where <input>   is input file (i.e. PennCNV "raw" CNV output file)
       <output>  is name of PLINK file to create
       <famfile> [optional] family info file
       <comfam>  [optional] common family id to use for all samples
       -h        displays this message

 It is recommended that the output file ends in ".cnv".

 The PLINK file requires each sample to have an individual id (iid) and a family id
 (fid) such that a sample is identified from the fid-iid combination (i.e. the same
 iid could refer to different people with different fids). However, PennCNV treats
 each sample ID as unique; it does not have the concept of a family ID. By default
 this script just assigns each sample's iid = sample ID and fid = iid, so by
 definition each sample will belong to a unique fid.

 However, this can be overridden by using either of the following options:

 <famfile> name of a file containing family information, fid in col. 1 and iid in
 col. 2. An optional (case-insensitive) header line can be inserted; this must
 start with "fid" or "family".

 If you use the <famfile> option it is your responsibility to ensure that each iid
 belongs to only one fid. A warning will be given if this is not the case.

 <comfam> specifies the common (same) family id to use for all samples in <input>.

END

  print $explain;
  exit();

}

# EOF









