#!/usr/bin/perl use warnings; use strict; use Getopt::Long; use Pod::Usage; our $REVISION = '$Revision: 2abe1a1530386dbdd7ac3f29b59547ff26ba2214 $'; our $DATE = '$Date: 2024-02-19 10:15:59 -0500 (Mon, 19 Feb 2024) $'; our $AUTHOR = '$Author: Kai Wang $'; our ($verbose, $help, $man); our ($dbtype, $dbfile, $outfile, $vcffile, $dbsnpfile, $buildver, $transcriptid, $cdotfile, $afstring, $acstring, $infostring, $twopos, $dbnsfpver); our %iupac = (R=>'AG', Y=>'CT', S=>'GC', W=>'AT', K=>'GT', M=>'AC', A=>'AA', C=>'CC', G=>'GG', T=>'TT', B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT', '.'=>'-', '-'=>'-'); GetOptions('verbose'=>\$verbose, 'help|h'=>\$help, 'man'=>\$man, 'outfile=s'=>\$outfile, 'dbtype=s'=>\$dbtype, 'vcffile=s'=>\$vcffile, 'dbsnpfile=s'=>\$dbsnpfile, 'buildver=s'=>\$buildver, 'transcriptid=s'=>\$transcriptid, 'cdotfile=s'=>\$cdotfile, 'afstring=s'=>\$afstring, 'acstring=s'=>\$acstring, 'infostring=s'=>\$infostring, 'twopos'=>\$twopos, 'dbnsfpver=s'=>\$dbnsfpver) or pod2usage (); $help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT); $man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT); @ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT); @ARGV == 1 or pod2usage ("Syntax error"); ($dbfile) = @ARGV; $dbtype or pod2usage ("Error in argument: please specify --dbtype argument"); $dbtype eq 'cosmic' and $vcffile || pod2usage ("Error in argument: please specify --vcffile argument"); if ($outfile) { open (STDOUT, ">$outfile") or die "Error: cannot write to output file $outfile: $!\n"; } if ($dbtype eq 'clinvar') { prepareClinVar ($dbfile); } elsif ($dbtype eq 'clinvar2') { prepareClinVar2 ($dbfile); } elsif ($dbtype eq 'clinvar_preprocess') { ClinvarPreprocess ($dbfile); } elsif ($dbtype eq 'clinvar_preprocess2') { ClinvarPreprocess2 ($dbfile); } elsif ($dbtype eq 'cosmic') { prepareCosmic ($dbfile, $vcffile); } else { pod2usage ("Error: the -dbtype of $dbtype is not supported yet"); } sub prepareClinVarOld { my ($dbfile) = @_; open (FH, "convert2annovar.pl -format vcf4 -include $dbfile |") or die "Error: cannot read from dbfile $dbfile: $!\n"; if ($outfile) { open (STDOUT, ">$outfile") or die "Error: cannot write to output file: $!\n"; } my %sig = (0 => 'unknown', 1 => 'untested', 2 => 'non-pathogenic', 3 => 'probable-non-pathogenic', 4 => 'probable-pathogenic', 5 => 'pathogenic', 6 => 'drug-response', 7 => 'histocompatibility', 255 => 'other'); while () { m/^#/ and next; s/[\r\n]+$//; my @field = split(/\t/, $_); $field[12]=~m/(CLNDBN=([^;]+);.*CLNACC=([^;]+))/ or die "Error: invalid record found in avinputfile: <$field[12]>"; my $clndbnacc = $1; $field[12]=~m/(CLNDSDB=([^;]+);.*CLNDSDBID=([^;]+))/ or die "Error: invalid record found in avinputfile: <$field[12]>"; my $clndsdb = $1; $field[12]=~m/CLNSIG=([\d\|\,]+)/ or die "Error: invalid record found in avinput file: <$field[12]>"; #CLNSIG=5|5|5,0|0; in RCV000013623.23 my @sigall = split (/\|/, $1); #CLNSIG=5|2|2; my (@clnsig, $clnsig); for my $i (0 .. @sigall-1) { if ($sigall[$i] =~ m/,/) { my @temp = split (/,/, $sigall[$i]); @temp = map {$sig{$_} || 'unknown'} @temp; push @clnsig, join(",", @temp); } else { push @clnsig, $sig{$sigall[$i]} || 'unknown'; } } $clnsig = join ('|', @clnsig); print join ("\t", @field[0..4]), "\t", "CLINSIG=$clnsig;$clndbnacc;$clndsdb\n"; } } sub ClinvarPreprocess { #the goal is to pre-process clinvar VCF (after splitting by VT) and delete unrelevant mutations from the file, before doing left-normalization #prepare_annovar.pl -dbtype clinvar_preprocess ~/temp8 -out ~/temp9 my ($dbfile) = @_; open (FH, $dbfile) or die "Error: cannot read from inputfile $dbfile: $!\n"; if ($outfile) { open (STDOUT, ">$outfile") or die "Error: cannot write to output file: $!\n"; } my ($posindex, $prestring) = (0, ''); while () { if (m/^#/) { #header lines are printed identically print; next; } if (m/(OLD_MULTIALLELIC=.+)$/) { #only if we use vt for splitting multi-allelic variants in VCF files if ($prestring eq $1) { $posindex++; } else { $posindex = 0; #found a new multiallelic variant $prestring = $1; } } else { ($posindex, $prestring) = (0, ''); } my @field = split(/\t/, $_); $field[7]=~m/(CLNDBN=([^;]+);.*CLNACC=([^;]+))/ or die "Error: invalid record found in avinputfile: <$field[7]>"; my ($clndbn, $clnacc) = ($2, $3); $field[7]=~m/(CLNDSDB=([^;]+);.*CLNDSDBID=([^;]+))/ or die "Error: invalid record found in avinputfile: <$field[7]>"; my ($clndsdb, $clndsdbid) = ($2, $3); $field[7]=~m/CLNSIG=([^;]+)/ or die "Error: invalid record found in avinput file: <$field[7]>"; my ($clnsig) = ($1); #the key here is to delete mutations are that are not really in the record itself #an example below that the C allele is actually not in the annotation #7 150648198 rs1137617 A C,G,T . . RS=1137617;RSPOS=150648198;RV;dbSNPBuildID=86;SSR=0;SAO=1;VP=0x05017800030515053e110100;GENEINFO=KCNH2:3757;WGT=1;VC=SNV;PM;TPA;PMC;SLO;REF;SYN;ASP;VLD;G5;HD;GNO;KGPhase1;KGPhase3;LSD;OM;CLNALLE=2,3;CLNHGVS=NC_000007.13:g.150648198A>G,NC_000007.13:g.150648198A>T;CLNSRC=.,.;CLNORIGIN=1,1;CLNSRCID=.,.;CLNSIG=2|2|3,5;CLNDSDB=MedGen|MedGen|MeSH:MedGen:SNOMED_CT,MedGen;CLNDSDBID=CN169374|CN230736|D008133:C0023976:9651007,CN221809;CLNDBN=not_specified|Cardiovascular_phenotype|Long_QT_syndrome,not_provided;CLNREVSTAT=mult|single|single,single;CLNACC=RCV000181727.2|RCV000253499.1|RCV000283094.1,RCV000181832.1;CAF=0.2278,.,0.7722,.;COMMON=1 if ($prestring) { #if there is a multi-allelic variant $field[7] =~ m/CLNHGVS=([^;]+);/ or die "Error: invalid record found in avinput file: <$field[7]>"; my @hgvs = split (/,/, $1); #defined $hgvs[$posindex] or warn "posindex ($posindex) not found in $field[12] (@field[0..4])" and next; #something is wrong defined $hgvs[$posindex] or next; if ($hgvs[$posindex] =~ m/(?:g|m)\.\d+(\w+)>(\w+)/) { my ($nt1, $nt2) = ($1, $2); if ($nt2 ne $field[4]) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+(\w+)\\x3d/) { #RS=1042714 CLNHGVS=NC_000005.9:g.148206473G\x3d 1; #nothing needs to be done, assume that the first occurence is the one that we are interested in } elsif ($hgvs[$posindex] =~ m/(?:g|m)\.\d+dup(\w)$/) { #RS=142323886 CLNHGVS=NC_000012.11:g.25359046_25359047dupAA,NC_000012.11:g.25359046dupA if (not $field[4] =~ m/\w$1$/) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/(?:g|m)\.\d+_\d+dup(\w+)$/) { #RS=113564356 CLNHGVS=NC_000003.11:g.150645419_150645420dupAC,NC_000003.11:g.150645419_150645422dupACAC if (not $field[4] =~ m/^\w$1$/) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.(\d+)_(\d+)dup$/) { #RS=113564356 CLNHGVS=NC_000003.11:g.150645419_150645420dupAC,NC_000003.11:g.150645419_150645422dupACAC if (length($field[4])-length($field[3]) != $2-$1+1) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+dup$/) { #RS=113564356 CLNHGVS=NC_000003.11:g.150645419_150645420dupAC,NC_000003.11:g.150645419_150645422dupACAC if (length($field[4])-length($field[3]) != 1) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+_\d+ins(\w+)$/) { #RS=779985493 CLNHGVS=NC_000001.10:g.241663898_241663899insGAGAGA if ($field[3].$1 ne $field[4]) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+_\d+del(\d+)$/) { #RS=267608120 CLNHGVS=NC_000002.11:g.48033292_48033308del17 if (length($field[3]) - length($field[4]) != $1) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+_\d+del(\w+)$/) { #RS=267608120 CLNHGVS=NC_000002.11:g.48033748_48033751delCAAG if (not ($field[3] =~ m/\w$1$/ and $field[4] =~ m/^\w$/)) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+_\d+del(\w+)ins(\w+)$/) { #RS=794728490 CLNHGVS=NC_000007.13:g.150648198_150648200delATAinsGTG if (not ($field[3] =~ m/\w$1$/ and $field[4] =~ m/^\w$2$/)) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+del(\w)/) { #RS=267608120 CLNHGVS=NC_000002.11:g.48033748_48033751delCAAG if (not ($field[3] =~ m/\w$1$/ and $field[4] =~ m/^\w$/)) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+_\d+inv(\w+)/) { #RS=879353161 CLNHGVS=NC_000010.10:g.73115941_73115942invTG if (not $field[3] =~ m/\w$1$/) { $posindex--; next; } } else { warn "WARNING: non-canonical pattern found in HGVS: $field[7]>"; } my @temp; @temp = split (/,/, $clnsig); $clnsig = $temp[$posindex]; @temp = split (/,/, $clndbn); $clndbn = $temp[$posindex]; @temp = split (/,/, $clndsdb); $clndsdb = $temp[$posindex]; @temp = split (/,/, $clndsdbid); $clndsdbid = $temp[$posindex]; @temp = split (/,/, $clnacc); $clnacc = $temp[$posindex]; $field[7] =~ s/CLNDBN=([^;]+)/CLNDBN=$clndbn/; $field[7] =~ s/CLNACC=([^;]+)/CLNACC=$clnacc/; $field[7] =~ s/CLNDSDB=([^;]+)/CLNDSDB=$clndsdb/; $field[7] =~ s/CLNDSDBID=([^;]+)/CLNDSDBID=$clndsdbid/; $field[7] =~ s/CLNSIG=([^;]+)/CLNSIG=$clnsig/; $field[7] =~ s/CLNHGVS=([^;]+)/CLNHGVS=$hgvs[$posindex]/; } print STDOUT join("\t", @field); } } sub ClinvarPreprocess2 { #the goal is to pre-process clinvar VCF (after splitting by VT) and delete unrelevant mutations from the file, before doing left-normalization #prepare_annovar.pl -dbtype clinvar_preprocess ~/temp8 -out ~/temp9 my ($dbfile) = @_; open (FH, $dbfile) or die "Error: cannot read from inputfile $dbfile: $!\n"; if ($outfile) { open (STDOUT, ">$outfile") or die "Error: cannot write to output file: $!\n"; } my ($posindex, $prestring) = (0, ''); while () { if (m/^#/) { #header lines are printed identically print; next; } if (m/(OLD_MULTIALLELIC=.+)$/) { #only if we use vt for splitting multi-allelic variants in VCF files if ($prestring eq $1) { $posindex++; } else { $posindex = 0; #found a new multiallelic variant $prestring = $1; } } else { ($posindex, $prestring) = (0, ''); } ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= my @field = split(/\t/, $_); $field[7]=~m/CLNDNINCL/ and next; #this is for included variant #Interpretations may be made on a single variant or a set of variants, such as a haplotype. Variants #that have only been interpreted as part of a set of variants (i.e. no direct interpretation for the #variant itself) are considered "included" variants. The VCF files include both variants with a direct #interpretation and included variants. Included variants do not have an associated disease (CLNDN, #CLNDISDB) or a clinical significance (CLNSIG). Instead there are three tags are specific to the #included variants - CLNDNINCL, CLNDISDBINCL, and CLNSIGINCL (see below). $field[7]=~m/ALLELEID=([^;]+)/ or die "Error: invalid record found in avinputfile: <$field[7]>"; my $alleleid = $1; $field[7]=~m/CLNDN=([^;]+)/ or die "Error: invalid record found in avinputfile: <$field[7]>"; my $clndn = $1; $field[7]=~m/CLNDISDB=([^;]+)/ or die "Error: invalid record found in avinputfile: <$field[7]>"; my $clndisdb = $1; $field[7]=~m/CLNREVSTAT=([^;]+)/ or die "Error: invalid record found in avinputfile: <$field[7]>"; my $clnrevstat = $1; $field[7]=~m/CLNSIG=([^;]+)/ or die "Error: invalid record found in avinputfile: <$field[7]>"; my $clnsig = $1; #the key here is to delete mutations are that are not really in the record itself #an example below that the C allele is actually not in the annotation #7 150648198 rs1137617 A C,G,T . . RS=1137617;RSPOS=150648198;RV;dbSNPBuildID=86;SSR=0;SAO=1;VP=0x05017800030515053e110100;GENEINFO=KCNH2:3757;WGT=1;VC=SNV;PM;TPA;PMC;SLO;REF;SYN;ASP;VLD;G5;HD;GNO;KGPhase1;KGPhase3;LSD;OM;CLNALLE=2,3;CLNHGVS=NC_000007.13:g.150648198A>G,NC_000007.13:g.150648198A>T;CLNSRC=.,.;CLNORIGIN=1,1;CLNSRCID=.,.;CLNSIG=2|2|3,5;CLNDSDB=MedGen|MedGen|MeSH:MedGen:SNOMED_CT,MedGen;CLNDSDBID=CN169374|CN230736|D008133:C0023976:9651007,CN221809;CLNDBN=not_specified|Cardiovascular_phenotype|Long_QT_syndrome,not_provided;CLNREVSTAT=mult|single|single,single;CLNACC=RCV000181727.2|RCV000253499.1|RCV000283094.1,RCV000181832.1;CAF=0.2278,.,0.7722,.;COMMON=1 if ($prestring) { #if there is a multi-allelic variant $field[7] =~ m/CLNHGVS=([^;]+);/ or die "Error: invalid record found in avinput file: <$field[7]>"; my @hgvs = split (/,/, $1); #defined $hgvs[$posindex] or warn "posindex ($posindex) not found in $field[12] (@field[0..4])" and next; #something is wrong defined $hgvs[$posindex] or next; if ($hgvs[$posindex] =~ m/(?:g|m)\.\d+(\w+)>(\w+)/) { my ($nt1, $nt2) = ($1, $2); if ($nt2 ne $field[4]) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+(\w+)\\x3d/) { #RS=1042714 CLNHGVS=NC_000005.9:g.148206473G\x3d 1; #nothing needs to be done, assume that the first occurence is the one that we are interested in } elsif ($hgvs[$posindex] =~ m/(?:g|m)\.\d+dup(\w)$/) { #RS=142323886 CLNHGVS=NC_000012.11:g.25359046_25359047dupAA,NC_000012.11:g.25359046dupA if (not $field[4] =~ m/\w$1$/) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/(?:g|m)\.\d+_\d+dup(\w+)$/) { #RS=113564356 CLNHGVS=NC_000003.11:g.150645419_150645420dupAC,NC_000003.11:g.150645419_150645422dupACAC if (not $field[4] =~ m/^\w$1$/) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.(\d+)_(\d+)dup$/) { #RS=113564356 CLNHGVS=NC_000003.11:g.150645419_150645420dupAC,NC_000003.11:g.150645419_150645422dupACAC if (length($field[4])-length($field[3]) != $2-$1+1) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+dup$/) { #RS=113564356 CLNHGVS=NC_000003.11:g.150645419_150645420dupAC,NC_000003.11:g.150645419_150645422dupACAC if (length($field[4])-length($field[3]) != 1) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+_\d+ins(\w+)$/) { #RS=779985493 CLNHGVS=NC_000001.10:g.241663898_241663899insGAGAGA if ($field[3].$1 ne $field[4]) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+_\d+del(\d+)$/) { #RS=267608120 CLNHGVS=NC_000002.11:g.48033292_48033308del17 if (length($field[3]) - length($field[4]) != $1) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+_\d+del(\w+)$/) { #RS=267608120 CLNHGVS=NC_000002.11:g.48033748_48033751delCAAG if (not ($field[3] =~ m/\w$1$/ and $field[4] =~ m/^\w$/)) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+_\d+del(\w+)ins(\w+)$/) { #RS=794728490 CLNHGVS=NC_000007.13:g.150648198_150648200delATAinsGTG if (not ($field[3] =~ m/\w$1$/ and $field[4] =~ m/^\w$2$/)) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+del(\w)/) { #RS=267608120 CLNHGVS=NC_000002.11:g.48033748_48033751delCAAG if (not ($field[3] =~ m/\w$1$/ and $field[4] =~ m/^\w$/)) { $posindex--; next; } } elsif ($hgvs[$posindex] =~ m/g\.\d+_\d+inv(\w+)/) { #RS=879353161 CLNHGVS=NC_000010.10:g.73115941_73115942invTG if (not $field[3] =~ m/\w$1$/) { $posindex--; next; } } else { warn "WARNING: non-canonical pattern found in HGVS: $field[7]>"; } my @temp; @temp = split (/,/, $alleleid); $alleleid = $temp[$posindex]; @temp = split (/,/, $clndn); $clndn = $temp[$posindex]; @temp = split (/,/, $clndisdb); $clndisdb = $temp[$posindex]; @temp = split (/,/, $clnrevstat); $clnrevstat = $temp[$posindex]; @temp = split (/,/, $clnsig); $clnsig = $temp[$posindex]; $field[7] =~ s/ALLELEID=([^;]+)/ALLELEID=$alleleid/; $field[7] =~ s/CLNDN=([^;]+)/CLNDN=$clndn/; $field[7] =~ s/CLNDISDB=([^;]+)/CLNDISDB=$clndisdb/; $field[7] =~ s/CLNREVSTAT=([^;]+)/CLNREVSTAT=$clnrevstat/; $field[7] =~ s/CLNSIG=([^;]+)/CLNSIG=$clnsig/; $field[7] =~ s/CLNHGVS=([^;]+)/CLNHGVS=$hgvs[$posindex]/; } print STDOUT join("\t", @field); } } sub prepareClinVar { my ($dbfile) = @_; open (FH, "convert2annovar.pl -format vcf4 -include $dbfile |") or die "Error: cannot read from dbfile $dbfile: $!\n"; if ($outfile) { open (STDOUT, ">$outfile") or die "Error: cannot write to output file: $!\n"; } #my %sig = (0 => 'unknown', 1 => 'untested', 2 => 'non-pathogenic', 3 => 'probable-non-pathogenic', 4 => 'probable-pathogenic', 5 => 'pathogenic', 6 => 'drug-response', 7 => 'histocompatibility', 255 => 'other'); my %sig = (0 => 'Uncertain significance', 1 => 'not provided', 2 => 'Benign', 3 => 'Likely benign', 4 => 'Likely pathogenic', 5 => 'Pathogenic', 6 => 'drug response', 7 => 'histocompatibility', 255 => 'other'); while () { m/^#/ and next; s/[\r\n]+$//; my @field = split(/\t/, $_); $field[12]=~m/(CLNDBN=([^;]+);.*CLNACC=([^;]+))/ or die "Error: invalid record found in avinputfile: <$field[12]>"; my ($clndbn, $clnacc) = ($2, $3); $field[12]=~m/(CLNDSDB=([^;]+);.*CLNDSDBID=([^;]+))/ or die "Error: invalid record found in avinputfile: <$field[12]>"; my ($clndsdb, $clndsdbid) = ($2, $3); #the goal below is to split both "|" and ",". I should first split on "," before splitting on "|", but did not in an opposite way. It does not affect results though $field[12]=~m/CLNSIG=([\d\|\,]+)/ or die "Error: invalid record found in avinput file: <$field[12]>"; #CLNSIG=5|5|5,0|0; in RCV000013623.23 my @sigall = split (/\|/, $1); #CLNSIG=5|2|2; my (@clnsig, $clnsig); for my $i (0 .. @sigall-1) { if ($sigall[$i] =~ m/,/) { my @temp = split (/,/, $sigall[$i]); @temp = map {$sig{$_} || 'unknown'} @temp; push @clnsig, join(",", @temp); } else { push @clnsig, $sig{$sigall[$i]} || 'unknown'; } } $clnsig = join ('|', @clnsig); $clnsig =~ s/,/\\x2c/g; $clndbn =~ s/,/\\x2c/g; $clnacc =~ s/,/\\x2c/g; $clndsdb =~ s/,/\\x2c/g; $clndsdbid =~ s/,/\\x2c/g; print join ("\t", @field[0..4], $clnsig, $clndbn, $clnacc, $clndsdb, $clndsdbid), "\n"; } } sub prepareClinVar2 { my ($dbfile) = @_; open (FH, "convert2annovar.pl -format vcf4 -include $dbfile |") or die "Error: cannot read from dbfile $dbfile: $!\n"; if ($outfile) { open (STDOUT, ">$outfile") or die "Error: cannot write to output file: $!\n"; } ###INFO= ###INFO= ###INFO= ###INFO= ###INFO= ###INFO= ###INFO= ###INFO= # while () { m/^#/ and next; s/[\r\n]+$//; my @field = split(/\t/, $_); $field[12]=~m/ALLELEID=([^;]+)/ or die "Error: invalid record found in avinputfile: <$field[12]>"; my $alleleid = $1; my ($clndn, $oncdn, $scidn) = qw/. . ./; if ($field[12]=~m/CLNDN=([^;]+)/) { $clndn = $1; } if ($field[12]=~m/ONCDN=([^;]+)/) { $oncdn = $1; } if ($field[12]=~m/SCIDN=([^;]+)/) { $scidn = $1; } my ($clndisdb, $oncdisdb, $scidisdb) = qw/. . ./; #$field[12]=~m/(?:CLN|ONC|SCI)DISDB=([^;]+)/ or die "Error: invalid record found in avinputfile: <$field[12]>"; $field[12] =~ m/CLNDISDB=([^;]+)/ and $clndisdb = $1; $field[12] =~ m/ONCDISDB=([^;]+)/ and $oncdisdb = $1; $field[12] =~ m/SCIDISDB=([^;]+)/ and $scidisdb = $1; #Sometimes Clinvar does not provide this record, for example, ALLELEID=981330 in ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar_20210123.vcf.gz file my ($clnrevstat, $oncrevstat, $scirevstat) = qw/. . ./; #$field[12] =~ m/(CLN|ONC|SCI)REVSTAT=/ or die "Error: invalid record found in avinputfile: <$field[12]>"; $field[12]=~m/CLNREVSTAT=([^;]+)/ and $clnrevstat = $1; $field[12]=~m/ONCREVSTAT=([^;]+)/ and $oncrevstat = $1; $field[12]=~m/SCIREVSTAT=([^;]+)/ and $scirevstat = $1; my ($clnsig, $onc, $sci) = qw/. . ./; #$field[12] =~ m/(CLNSIG|ONC|SCI)=/ or die "Error: invalid record found in avinputfile: <$field[12]>"; $field[12]=~m/CLNSIG=([^;]+)/ and $clnsig = $1; $field[12]=~m/ONC=([^;]+)/ and $onc = $1; $field[12]=~m/SCI=([^;]+)/ and $sci = $1; if ($clnsig eq '.' and $onc eq '.' and $sci eq '.') { $field[12]=~m/(CLNDN|ONC|SCI)INCL/ or warn "WARNING: skipping invalid record found in avinputfile: <$field[12]>"; #20220329: ALLELEID=1324182;CLNDISDB=MONDO:MONDO:0009335,MedGen:C2749604,OMIM:235400 next; } $alleleid =~ s/,/\\x2c/g; $clndn =~ s/,/\\x2c/g; $oncdn =~ s/,/\\x2c/g; $scidn =~ s/,/\\x2c/g; $clndisdb =~ s/,/\\x2c/g; $oncdisdb =~ s/,/\\x2c/g; $scidisdb =~ s/,/\\x2c/g; $clnrevstat =~ s/,/\\x2c/g; $oncrevstat =~ s/,/\\x2c/g; $scirevstat =~ s/,/\\x2c/g; $clnsig =~ s/,/\\x2c/g; $onc =~ s/,/\\x2c/g; $sci =~ s/,/\\x2c/g; print join ("\t", @field[0..4], $alleleid, $clndn, $clndisdb, $clnrevstat, $clnsig, $oncdn, $oncdisdb, $oncrevstat, $onc, $scidn, $scidisdb, $scirevstat, $sci), "\n"; } } sub prepareCosmic { my ($dbfile, $vcffile) = @_; my %mut; #key=mutation ID value=chr+pos+ref+alt my (%cosmic, %cosmicid, %nonfound); my ($idprefix); if ($vcffile =~ m/\.vcf\.gz$/) { open (VCF, "gunzip -c $vcffile |") or die "Error: cannot read from gunzip: $!\n"; } else { open (VCF, $vcffile) or die "Error: cannot read from VCF file: $!\n"; } $_ = ; m/^##fileformat=VCFv4/ or die "Error: the supplied VCF file does not have valid version 4 header\n"; while () { m/^#/ and next; my @field = split (/\t/, $_); #following paragraph is no longer necessary with new version of COSMIC #if ($field[2] =~ s/^COSM//) { #for coding variants # $idprefix = "COSM"; #} elsif ($field[2] =~ s/^COSN//) { #for non-coding variants # $idprefix = "COSN"; #} else { # die "Error: invalid record found in VCF file: ID does not start with COSM or COSN: <$_>\n"; #} my ($chr, $start, $id, $ref_allele, $mut_allele) = @field[0..4]; my $end = $start + length($ref_allele) - 1; $ref_allele =~ m/[^ACGTacgt]/ and next; $mut_allele =~ m/[^ACGTacgt]/ and next; if (length ($ref_allele) > 1 or length ($mut_allele) > 1) { if(length($ref_allele) > length ($mut_allele)) { # deletion or block substitution my $head = substr($ref_allele, 0, length ($mut_allele)); if ($head eq $mut_allele) { $start = $start+length($head); #then change start position $ref_allele = substr ($ref_allele, length ($mut_allele)); $mut_allele = "-"; } elsif (substr ($ref_allele, 0, 1) eq substr ($mut_allele, 0, 1)) { #first base is identical (this is used in many VCF files) $start++; $ref_allele = substr ($ref_allele, 1); $mut_allele = substr ($mut_allele, 1); } } elsif(length($mut_allele) >= length ($ref_allele)) { # insertion or block substitution my $head = substr ($mut_allele, 0, length ($ref_allele)); if ($head eq $ref_allele) { $start = $start+length($ref_allele)-1; $mut_allele = substr ($mut_allele, length ($ref_allele)); $ref_allele = '-'; } elsif (substr ($ref_allele, 0, 1) eq substr ($mut_allele, 0, 1)) { #first base is identical (this is used in many VCF files) $start++; $ref_allele = substr ($ref_allele, 1); $mut_allele = substr ($mut_allele, 1); } } } $mut{$id} = join ("\t", $chr, $start, $end, $ref_allele, $mut_allele); } print STDERR "NOTICE: Finished reading ", scalar (keys %mut), " mutation ID from the VCF file $vcffile\n"; open (DB, $dbfile) or die "Error: cannot read from DB file: $!\n"; $_ = ; my @field = split (/\t/, $_); my $headerline = $_; my ($mutid, $filetype); my ($mutid_index, $id_tumor_index, $primary_site_index, $phenotype_index, $sample_index); if ($field[16] eq 'Mutation ID' or $field[16] eq 'GENOMIC_MUTATION_ID') { #'Mutation ID' is for v89 and before, 'GENOMIC_MUTATION_ID' is for v90 $mutid_index = 16; $mutid = $field[$mutid_index]; $filetype = 'coding'; } elsif ($field[17] eq 'GENOMIC_MUTATION_ID') { #'GENOMIC_MUTATION_ID' is for v91, v92, etc in the 18th column (one extra column called 'MUTATION_ID' is added before this column), the ID_NCV is no longer in the noncoding file either $mutid_index = 17; $mutid = $field[$mutid_index]; $filetype = 'coding'; } elsif ($field[11] eq 'ID_NCV' or $field[11] eq 'GENOMIC_MUTATION_ID') { #'ID_NCV' is for v89 and before, 'GENOMIC_MUTATION_ID' is for v90 $mutid_index = 11; $mutid = $field[$mutid_index]; $filetype = 'noncoding'; } elsif ($headerline =~ m/\bGENOMIC_MUTATION_ID\b/) { for my $i (0 .. @field-1) { if ($field[$i] eq 'GENOMIC_MUTATION_ID') { $mutid_index = $i; } elsif ($field[$i] eq 'ID_tumour') { $id_tumor_index = $i; } elsif ($field[$i] eq 'Primary site') { $primary_site_index = $i; } elsif ($field[$i] eq 'COSMIC_PHENOTYPE_ID') { $phenotype_index = $i; } elsif ($field[$i] eq 'COSMIC_SAMPLE_ID') { $sample_index = $i; } } $filetype = 'unknown'; } else { die "Error: COSMIC MutantExport format error: column 17 or 18 or 12 should be 'Mutation ID' or 'GENOMIC_MUTATAION_ID' or 'ID_NCV', or at least 'GENOMIC_MUTATAION_ID' should be present in the header line\n"; } defined $mutid_index or die "Error: Unable to locate Mutation ID column from the header line of the input file $dbfile\n"; $filetype eq 'unknown' and print STDERR "NOTICE: unknown file type with mutid_index=$mutid_index, phenotype_index=$phenotype_index, sample_index=$sample_index\n"; while () { my ($id_tumor, $primary_site); @field = split (/\t/, $_); if ($filetype eq 'coding') { $mutid = $field[$mutid_index]; #it could be 16 or 17 $id_tumor = $field[6]; $primary_site = $field[7]; } elsif ($filetype eq 'noncoding') { $mutid = $field[$mutid_index]; #it is 11 but may change to a different number in the future $id_tumor = $field[2]; $primary_site = $field[3]; } elsif ($filetype eq 'unknown') { #like cosmic99 $mutid = $field[$mutid_index]; if (defined $id_tumor_index) { $id_tumor = $field[$id_tumor_index]; } elsif (defined $sample_index) { $id_tumor = $field[$sample_index]; } if (defined $primary_site_index) { $primary_site = $field[$primary_site_index]; } elsif (defined $phenotype_index) { $primary_site = $field[$phenotype_index]; } } my $chrstring = $mut{$mutid}; if (not defined $chrstring) { $nonfound{$mutid}++; next; } if ($id_tumor) { #does not exist in cosmic99 but we are using sample_index and phenotype_index instead $cosmic{$chrstring} .= ";$id_tumor,$primary_site"; #these two columns are "ID_tumour" and "Primary site" } #$cosmicid{$chrstring} .= ",$idprefix$field[16]"; #this is no longer necessary with new version of COSMIC $cosmicid{$chrstring} .= ",$mutid"; } print STDERR "NOTICE: Finished reading ", scalar (keys %cosmicid), " COSMIC records in DB file $dbfile\n"; print STDERR "WARNING: ", scalar (keys %nonfound), " COSMIC ID from MutantExport file cannot be found in VCF file (this may be normal if the VCF file only contains coding or noncoding variants\n"; #the following is to eliminate duplicate entries. For example, COSM256593 and COSM256594 refers to the same mutation in the same sample, but are annotated twice, once with CDK11B/NM_001787 and once with CDC2L2/ENST00000357760 for my $key (keys %cosmicid) { if ($cosmic{$key}) { my @id_site = split (/;/, $cosmic{$key}); shift @id_site; my (%found_tumorid, %found_site); for my $i (0 .. @id_site-1) { my ($tumorid, $site) = split (/,/, $id_site[$i]); $found_tumorid{$tumorid} and next; #the idea is that for duplicate entries, the tumor ID must be identical $found_tumorid{$tumorid}++; $found_site{$site}++; } my @cosmicid; @cosmicid = split (/,/, $cosmicid{$key}); shift @cosmicid; my %cosmicid = map {$_, 1} @cosmicid; print $key, "\t", "ID=", join (",", keys %cosmicid), ";", "OCCURENCE="; my $occurence; for my $site (keys %found_site) { $occurence .= "," . $found_site{$site} . "(" . $site . ")"; } $occurence =~ s/^,//; print "$occurence\n"; } else { #for future cosmic version, if sample/phenotype information is not available, then we only print out cosmic genomic_mutation_id my @newcosmicid = split (/,/, $cosmicid{$key}); shift @newcosmicid; my %newcosmicid = map {$_, 1} @newcosmicid; print $key, "\t", "ID=", join (",", keys %newcosmicid), "\n"; } } } sub sum { my $sum = 0; for (@_) { $sum += $_; } return $sum; } sub revcom { my ($seq) = @_; $seq = reverse $seq; $seq =~ tr/acgtACGT/tgcaTGCA/; return ($seq); } =head1 SYNOPSIS prepare_annovar_user.pl [arguments] Optional arguments: -h, --help print help message -m, --man print complete documentation -v, --verbose use verbose output --outfile output file name --dbtype specify database type (cosmic, clinvar, clinvar2) Function: reformat and prepare ANNOVAR annotation database Example: # For preparing COSMIC database for annotation in ANNOVAR prepare_annovar_user.pl -dbtype cosmic CosmicMutantExport.tsv -vcf CosmicCodingMuts.vcf > hg38_cosmic76.txt prepare_annovar_user.pl -dbtype cosmic CosmicMutantExport.tsv -vcf CosmicNonCodingVariants.vcf >> hg38_cosmic76.txt prepare_annovar_user.pl -dbtype cosmic CosmicNCV.tsv -vcf CosmicNonCodingVariants.vcf.gz > hg38_cosmic90_noncoding.txt prepare_annovar_user.pl -dbtype cosmic CosmicMutantExport.tsv -vcf CosmicCodingMuts.vcf.gz >> hg38_cosmic90_coding.txt # For preparing CLINVAR database for annotation in ANNOVAR wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20180603.vcf.gz wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20180603.vcf.gz.tbi vt decompose clinvar_20180603.vcf.gz -o temp.split.vcf prepare_annovar_user.pl -dbtype clinvar_preprocess2 temp.split.vcf -out temp.split2.vcf vt normalize temp.split2.vcf -r ~/project/seqlib/GRCh38/old/GRCh38.fa -o temp.norm.vcf -w 2000000 prepare_annovar_user.pl -dbtype clinvar2 temp.norm.vcf -out hg38_clinvar_20180603_raw.txt index_annovar.pl hg38_clinvar_20180603_raw.txt -out hg38_clinvar_20180603.txt -comment comment_20180708.txt Version: $Date: 2024-02-19 10:15:59 -0500 (Mon, 19 Feb 2024) $ =head1 OPTIONS =over 8 =item B<--help> print a brief usage message and detailed explanation of options. =item B<--man> print the complete manual of the program. =item B<--verbose> use verbose output. =item B<--outfile> specify the output file name. By default, output is written to STDOUT. =item B<--dbtype> specify database type, such as esp, cosmicgff, hgmdgff, etc =back =head1 DESCRIPTION This program is used to convert various databases to a format that can be used by ANNOVAR. For questions or comments, please contact kai@openbioinformatics.org. =cut