#!/usr/bin/perl # # This software has been created by Genome Research Limited (GRL). # GRL hereby grants permission to use, copy, modify and distribute # this software and its documentation for non-commercial purposes # without fee at the user's own risk on the basis set out below. GRL # neither undertakes nor accepts any duty whether contractual or # otherwise in connection with the software, its use or the use # of any derivative, and makes no representations or warranties, # express or implied, concerning the software, its suitability, # fitness for a particular purpose or non-infringement. In no event # shall the authors of the software or GRL be responsible or liable # for any loss or damage whatsoever arising in any way directly or # indirectly out of the use of this software or its derivatives, # even if advised of the possibility of such damage. Our software # can be freely distributed under the conditions set out above, # and must contain this copyright notice. # # penncnv_to_plink.pl : convert PennCNV output file to # PLINK-format CNV list file. # # See http://pngu.mgh.harvard.edu/~purcell/plink/cnv.shtml # # $Id$ # # Matthew Gillman, WTSI, 28th June 2010 # # PennCNV files have the form: # chr3:192548086-192552678 numsnp=8 length=4,593 state1,cn=0 # sample001.txt startsnp=rs6444540 endsnp=rs297396 conf=26.817 # # PLINK CNV list file has: # FID Family ID # Need both FID and IID to # IID Individual ID # uniquely specify an individual. # CHR Chromosome # BP1 Start position (base-pair) # BP2 End position (base-pair) # TYPE Type of variant, e.g. 0,1 or 3,4 copies # SCORE Confidence score associated with variant # SITES Number of probes in the variant # use strict; use warnings; use Getopt::Long; # # Options for specifying families: # 1. Do not specify; set each sample's family ID to sample ID. # 2. Set all samples to the same (common) family ID, $comfam. # 3. Supply a family file ($famfile) which gives each sample's family. # my ($infile, $outfile, $famfile, $comfam); my %famdat = (); # If used: K = individual ID (sample ID); V = its family ID. &process_command_line_arguments(); if ( $famfile ) { &read_famfile(); } &create(); ################################################### # # sub create() # # Read in the PennCNV file and create the PLINK file. # ################################################### sub create { open ( IN, $infile ) or die $!; open ( OUT, "> $outfile" ) or die $!; print OUT "FID\tIID\tCHR\tBP1\tBP2\tTYPE\tSCORE\tSITES\n"; my ($fid, $iid, $chr, $bp1, $bp2, $type, $score, $sites); while ( my $line = ) { chomp($line); my ($posstr, $numprobestr, $lengthstr, $statestr, $samplestr, $startstr, $endstr, $confstr); my $unused; my @junk = (); # These lines assume the input file is correctly formatted. # Let's hope they are. if ( $line =~ /conf/ ) { ($posstr, $numprobestr, $lengthstr, $statestr, $samplestr, $startstr, $endstr, $confstr, @junk) = split /\s+/, $line, 9; } else { ($posstr, $numprobestr, $lengthstr, $statestr, $samplestr, $startstr, $endstr, @junk) = split /\s+/, $line, 8; } # # $posstr : e.g. chr1:147305744-147427061 # my @posdat = split /:/, $posstr; my $chrdat = $posdat[0]; # e.g. "chr1" my @chars = split (//, $chrdat); for ( my $s = 1; $s <= 3; $s++ ) { shift(@chars); } $chr = join("", @chars); # if ( $chr eq "X" ) { $chr = 23; } # elsif ( $chr eq "Y" ) { $chr = 24; } # elsif ( $chr eq "XY" ) { $chr = 25; } # elsif ( $chr eq "MT" ) { $chr = 26; } my $bpdat = $posdat[1]; ($bp1, $bp2) = split /-/, $bpdat; # # $numprobestr : e.g. numsnp=8 # ($unused, $sites) = split /=/, $numprobestr; # # $statestr : e.g. state1,cn=0 # ($unused, $type) = split /=/, $statestr; # # $samplestr NB do we need to allow for / ? # $iid = $samplestr; if ( %famdat ) { if ( ! exists $famdat{$iid} ) { die "\nError: no famdat for $iid.\n"; } $fid = $famdat{$iid}; } elsif ( $comfam ) { $fid = $comfam; } else { $fid = $iid; } # # $confstr : e.g. conf=26.817 # if ( $confstr ) { ($unused, $score) = split /=/, $confstr; } else { $score = 0; } print OUT "$fid\t$iid\t$chr\t$bp1\t$bp2\t$type\t$score\t$sites\n"; } close OUT; close IN; } ################################################### # # sub read_famfile() # # Read in the family information file. # # NB assumes each iid belongs to only one fid, i.e. # iid unique across all samples. As PennCNV output # doesn't have the concept of families, should be OK. # ################################################### sub read_famfile { # Read in family file and populate %famdat. # two items per line - family id, then iid # header is optional open ( IN, $famfile ) or die $!; while ( my $line = ) { next if ( ( $line =~ /FAM/i ) || ( $line =~ /FID/i ) ); chomp($line); my ( $fid, $iid ) = split /\s+/, $line; if ( exists $famdat{$iid} ) { warn "\nWARNING: already have family info for $iid; ignoring new.\n"; } else { $famdat{$iid} = $fid; } } close IN; } ################################################### # # sub process_command_line_arguments() # ################################################### sub process_command_line_arguments { GetOptions ( 'input=s' => \$infile, # -i. Input file (from PennCNV). 'output=s' => \$outfile, # -o. File to produce. 'comfam=s' => \$comfam, # -c. Common family. 'famfile=s' => \$famfile, # -f. Family info file. 'help' => sub {&usage()}, ); if ( ! $outfile ) { print "\nPlease specify output file.\n"; &usage(); } if ( ! $infile ) { print "\nPlease specify input file.\n"; &usage(); } if ( $comfam && $famfile ) { print "\n *** Please specify EITHER a common family OR a family file. ***\n"; &usage(); } } ################################################### # # sub usage() # ################################################### sub usage { my $explain = < -o [ -f | -c ] | -h where is input file (i.e. PennCNV "raw" CNV output file) is name of PLINK file to create [optional] family info file [optional] common family id to use for all samples -h displays this message It is recommended that the output file ends in ".cnv". The PLINK file requires each sample to have an individual id (iid) and a family id (fid) such that a sample is identified from the fid-iid combination (i.e. the same iid could refer to different people with different fids). However, PennCNV treats each sample ID as unique; it does not have the concept of a family ID. By default this script just assigns each sample's iid = sample ID and fid = iid, so by definition each sample will belong to a unique fid. However, this can be overridden by using either of the following options: name of a file containing family information, fid in col. 1 and iid in col. 2. An optional (case-insensitive) header line can be inserted; this must start with "fid" or "family". If you use the option it is your responsibility to ensure that each iid belongs to only one fid. A warning will be given if this is not the case. specifies the common (same) family id to use for all samples in . END print $explain; exit(); } # EOF