#! /projects/rcsb/dev/usr/wbluhm/rcsb/local/bin/perl # Includes SwissProt and Trembl =head1 USAGE Sample command line: % ( pdb2sptr_cons20010228.pl pdb2sp_sources20010228.txt ) > & pdb2sptr_err (The program name may include a date) This script generates a consensus of SwissProt related information for PDB chain ids, using various sources, including the following files (or dated versions of them), as listed in the file pointed to by $ARGV[0]): pdb2sp.txt pdbids to spids, derived from SwissProt DR records pdb2spMIA.txt pdbids to spids, derived from MIA pdb2spDBREF.txt chain ids to spids, derived from PDB DBREF records pdb2spMAN.txt chain ids to spids, manually derived for non-consensus cases pdb2spBLAST.txt chain ids to spids, manually derived from Blast reports blast_dupl.txt chain ids to chain ids, duplicate sequences pdb2spCORR.txt chain ids to spids, manually entered corrections sp2names.txt spids to names and synonyms extracted from SP DE records sp2species.txt species information from SP OS record sp2kw.txt keywords (classifiers) from SP KW record The script generates several pieces of output: 1) The main output file ('pdb2sp_out') with the following information: PDBID:Chain"ID"SPIDalignmentname(synonym)* PDBID:Chain"SRC"(latin source term)(common source term) PDBID:Chain"KW"(SwissProt keyword)* for all chains for which consensus was accomplished 2) A log file ('pdb2sp_log') with various information for all chains and all sources of SwissProt info (sp, mia, dbref, man, blast, err) 3) Error logs written to STDERR =cut use strict; use Storable; # to store and retrieve hash of alignment scores use POM::Hash; # for information retrieval from POM use Bio::Index::Swissprot; use Bio::Seq; use Bio::SeqIO; use Bio::Tools::pSW; # BIOPERL tools # ------------- # # The following changes were made to the Bioperl distribution 0.6.2: # # 1) Bio::Index::Swissprot.pm # --------------------------- # # Downloaded Bio::Index::Swissprot.pm from bioperl's ftp site (bio.perl.org) # Copied module into /projects/rcsb/dev/usr/wbluhm/rcsb/local/lib/perl5/site_perl/5.6.0/Bio/Index # This was necessary because Bio::Index::EMBL didn't work properly with # Swissprot files for indexing and retrieval # Bio::Index::Swissprot.pm was not part of the packaged 0.6.2 distribution # # # 2) Bio::Index::AbstractSeq.pm (line 150) # ---------------------------------------- # # Commented out an exception for "no record found" since it resulted # in program termination. # Replaced it with an error message written to STDERR # # # 3) Bio::SimpleAlign.pm (line 681) # --------------------------------- # # Replaced method ary() with # method seq() followed by split into array. # ary() is deprecated. # This resulted in warning messages to STDERR for each sequence processed # $spinx is a SwissProt index object for Swissprot sequence retrieval # ------ # This makes an index object out of a SwissProt index file, # which in turn was generate by the following command line: # % perl indexSwiss.pl sptr /misc/pdbdev/wbluhm/swissprot/sprot.dat /misc/pdbdev/wbluhm/swissprot/trembl.dat # (must use absolute paths) # # The script itself ( /misc/pdbdev/wbluhm/swissprot/indexSwiss.pl is as follows: # ------------------------------------------------------------------------------ # use Bio::Index::Swissprot; # # my $Index_File_Name = shift; # my $inx = Bio::Index::Swissprot->new($Index_File_Name, 'WRITE'); # $inx->make_index(@ARGV); # ------------------------------------------------------------------------------ my $spinx = Bio::Index::Swissprot->new("/misc/pdbdev/wbluhm/swissprot/sptr"); my $fac = Bio::Tools::pSW->new(-matrix=>'blosum62.bla', -gap=>12, -ext=>2 ); # make alignment "factory" object my $MIN_SCORE_TO_ASSIGN_CHAIN = 98; # used in sub align() for assigning chain ID based # on the percent alignment between PDB and SP sequences my $BAD_SCORE_FOR_CONS = 60; # used in second part of consensus finding # anything below this won't even be considered my $GOOD_SCORE_FOR_CONS = 98; # used in second part of consensus finding my (%dbref, %n_enp, %chain_name, %chain_type, %dep_date, %seq) ; tie %dbref, "POM::Hash", "dbrefacc.enp", "OBSLTE"; tie %n_enp, "POM::Hash", "n_enp.com", "OBSLTE"; tie %chain_name, "POM::Hash", "name.enp", "OBSLTE"; tie %chain_type, "POM::Hash", "type.enp", "OBSLTE"; tie %dep_date, "POM::Hash", "date_tex.com", "OBSLTE"; tie %seq, "POM::Hash", "seq.enp", "OBSLTE"; # %scores is a hash for storing pairwise alignments # ------- # This hash is written to disk at the end of the script # under the name 'pdb2sptr_alignment_scores.bin' # and retrieved (if already present) at the beginning of the script # # That way, previously calculated alignments will be re-used # If it is explicitly desired to recalculate all alignments, # the file pdb2sptr_alignment_scores.bin' must be removed first my %scores; # hash with alignment scores my $scores_ref; # just a referene to the hash if ( -r "pdb2sptr_alignment_scores.bin" ) { $scores_ref = Storable::retrieve("pdb2sptr_alignment_scores.bin"); %scores = %$scores_ref; } my %chains; # the main hash my %sp_names; # hash with spids to names my %sp_synonyms; # synonyms my %sp_ids; # spid to spid, just to keep track of what I've come across my %sp_keywords; my %sp_species; my ($pdbseq, $pdbseqobj, $spseq, $spseqobj); # for sequence alignments my ($pdbid, $chain); my ($mia_entry, $sp_entry); my $src; my ($spid, $spac); my ($n_pdbids_total, $n_chains_wo_cons, $n_chains_total) = (0,0,0); my $n_chains_w_cons = 0; my $n_prot_chains = 0; my $n_prot_chains_wo_info = 0; my $n_chains_wo_name = 0; my $n_alignments = 0; my ( $n_scores, $average_score ) = (0,0); my @chains_w_pdb2sp_ali = (); my @fields; my @files = (); open ( FILES, "$ARGV[0]"); while () { chomp; push @files, $_; if ( ! -r $_ ) { print STDERR "Aborted -- file not found: $_\n"; exit; } } close FILES; open ( SP, "$files[0]" ); open ( SPMIA, "$files[1]" ); open ( SPDBREF, "$files[2]" ); open ( SPMAN, "$files[3]" ); open ( SPBLAST, "$files[4]" ); open ( SPDUPL, "$files[5]" ); open ( SPCORR, "$files[6]" ); open ( NAMES, "$files[7]" ); open ( SPECIES, "$files[8]" ); open ( KEYWORDS,"$files[9]" ); open ( OUT, ">pdb2sptr_out" ); open ( LOG, ">pdb2sptr_log" ); # ------------------------------------------- # Writing the header for the main output file # ------------------------------------------- print OUT "# This file was generated by\n"; print OUT "# $0 $ARGV[0]\n"; print OUT "# processing the following files:\n"; print OUT "# \n"; foreach ( @files ) { print OUT "# $_\n"; } print OUT "# \n"; print OUT "# The extracted records are:\n"; print OUT "# \n"; print OUT "# 'PDBID:Chain\"ID\"SwissProtIDalignmentname(synonym)*'\n"; print OUT "# 'PDBID:Chain\"SRC\"(latin source term)(common source term'\n"; print OUT "# 'PDBID:Chain\"KW\"(SwissProt keyword)*'\n"; print OUT "# \n"; print OUT "# for all protein chains where data have been consolidated\n"; print OUT "# \n"; # ------------------------------------------------- # Writing the header for the comprehensive log file # ------------------------------------------------- print LOG "# This file was generated by\n"; print LOG "# $0 $ARGV[0]\n"; print LOG "# processing the following files:\n"; print LOG "# \n"; foreach ( @files ) { print LOG "# $_\n"; } print LOG "# \n"; print LOG "# Chains are grouped under the following headings\n"; print LOG "# with the record types as indicated:\n"; print LOG "# \n"; print LOG "# 1) protein chains with complete consensus\n"; print LOG "# 'PDBID:ChainsourceSwissProtIDalignmentFirst_SwissProt_Name'\n"; print LOG "# \n"; print LOG "# 2) protein chains with SwissProt info but no consensus\n"; print LOG "# 'PDBID:ChainsourceSwissProtIDalignmentFirst_SwissProt_Name'\n"; print LOG "# \n"; print LOG "# 3) SwissProt info, unresolved chain id\n"; print LOG "# 'PDBID:???sourceSwissProtIDFirst_SwissProt_Name'\n"; print LOG "# \n"; print LOG "# 4) protein chains without any SwissProt info\n"; print LOG "# 'PDBID:Chain'\n"; print LOG "# \n"; print LOG "# 5) protein chains without consensus\n"; print LOG "# 'PDBID:Chain'\n"; print LOG "# \n"; print LOG "# 6) non-protein chains\n"; print LOG "# 'PDBID:Chainchain_type'\n"; print LOG "# \n"; print LOG "# 7) all alignment scores (regardless of consensus or source)\n"; print LOG "# 'PDBID:ChainSwissProtIDalignment\n"; print LOG "# \n"; print LOG "# A few statistics are listed at the end of the file\n"; print LOG "# \n"; print LOG "# ==================================================\n"; print LOG "# \n"; # ------------------------------------------- # Building up the main hash of all PDB chains # (with chain type info to start with) # ------------------------------------------- while ( @fields = each %chain_type) { # all chains in POM # ($pdbid,$chain) = split /:/, $fields[0]; # there are 69 cases of 'xxxx:x:x' # so can't just use a split any more $fields[0] =~ /^(\S{4}):(\S+)/; # to replace the simple split $pdbid = $1; $chain = $2; # next unless ( $dep_date{$pdbid} !~ /-99/ and # $dep_date{$pdbid} !~ /-00/ and # $dep_date{$pdbid} !~ /-01/ # ); # for once, skip everything processed before 01-JAN-99 chomp $fields[1]; $chains{$pdbid}{$chain}{type} = $fields[1]; } # ------------------------------------------- # Entering data from SwissProt into main hash # ------------------------------------------- print STDERR "Processing data from $files[0]\n"; while ( ) { next if ( /^\s*\#/ ); # skip comment lines next if ( ! /\S/ ); # skip blank lines chomp; @fields = split /\t/; $spid = $fields[1]; $sp_ids{$spid} = $spid; $pdbid = $fields[0]; if ( ! exists $chains{$pdbid} ) { print STDERR "pdbid $pdbid (from SP) doesn't exist\n"; next; } align(); # calculate alignments and enter them into the hash if ( $n_enp{$pdbid} == 1 ) { # only one chain ($chain) = keys %{$chains{$pdbid}}; # so, can safely assign chain id push @{$chains{$pdbid}{$chain}{sp}}, $fields[1]; } elsif ( all_chains_identical() ) { # all chains are identical foreach $chain ( keys %{$chains{$pdbid}} ) { push @{$chains{$pdbid}{$chain}{sp}}, $fields[1]; # enter sp info for all chains } } elsif ( only_one_protein_chain() ) { push @{$chains{$pdbid}{$chain}{sp}}, $fields[1]; # $chain was set in subroutine } elsif ( @chains_w_pdb2sp_ali ) { foreach $chain ( @chains_w_pdb2sp_ali ) { push @{$chains{$pdbid}{$chain}{sp}}, $fields[1]; # enter info for all aligned chains } } else { push @{$chains{$pdbid}{'???'}{sp}}, $fields[1]; # enter info for unassigned chain } } # ------------------------------------- # Entering data from MIA into main hash # ------------------------------------- print STDERR "Processing data from $files[1]\n"; while ( ) { next if ( /^\s*\#/ ); # skip comment lines next if ( ! /\S/ ); # skip blank lines chomp; @fields = split /\t/; $spid = $fields[1]; $sp_ids{$spid} = $spid; $pdbid = $fields[0]; if ( ! exists $chains{$pdbid} ) { print STDERR "pdbid $pdbid (from MIA) doesn't exist\n"; next; } align(); # calculate alignments and enter them into the hash if ( $n_enp{$pdbid} == 1 ) { # only one chain ($chain) = keys %{$chains{$pdbid}}; # so, can safely assign chain id push @{$chains{$pdbid}{$chain}{mia}}, $fields[1]; } elsif ( all_chains_identical() ) { # all chains are identical foreach $chain ( keys %{$chains{$pdbid}} ) { push @{$chains{$pdbid}{$chain}{mia}}, $fields[1]; # enter sp info for all chains } } elsif ( only_one_protein_chain() ) { push @{$chains{$pdbid}{$chain}{mia}}, $fields[1]; # $chain was set in subroutine } elsif ( @chains_w_pdb2sp_ali ) { foreach $chain ( @chains_w_pdb2sp_ali ) { push @{$chains{$pdbid}{$chain}{mia}}, $fields[1]; # enter info for all aligned chains } } else { push @{$chains{$pdbid}{'???'}{mia}}, $fields[1]; # enter info for unassigned chain } } # --------------------------------------- # Entering data from DBREF into main hash # --------------------------------------- print STDERR "Processing data from $files[2]\n"; while ( ) { next if ( /^\s*\#/ ); # skip comment lines next if ( ! /\S/ ); # skip blank lines chomp; @fields = split /\t/; $spid = $fields[1]; $sp_ids{$spid} = $spid; $fields[0] =~ /^(\S{4}):(\S+)/; $pdbid = $1; $chain = $2; # DBREF SP entries contain numerous invalid IDs with valid AC numbers if ( ! exists $scores{$pdbid}{$chain}{$spid} ) { # don't evaluate further if score already exists if ( ! ($spinx->fetch($spid)) ) { print STDERR "Invalid SP Id from DBREF: $spid\t"; # already error message from fetch $spac = ${$dbref{"$pdbid:$chain"}}[1]; # try AC number instead $spseqobj = $spinx->fetch($spac); if ( $spseqobj ) { $spid = $spseqobj->id(); # fix the SP ID $sp_ids{$spid} = $spid; print STDERR "Correct SP Id: $spid"; } } print STDERR "\n"; } if ( ! exists $chains{$pdbid} ) { print STERR "pdbid $pdbid (from DBREF) doesn't exist\n"; # this happened in one case (1FI1) when an old version of DBREF file was used } else { push @{$chains{$pdbid}{$chain}{dbref}}, $spid; align(); # careful, sub align() uses (and changes) $chain } } # --------------------------------------------- # Entering manual consensus data into main hash # --------------------------------------------- print STDERR "Processing data from $files[3]\n"; while ( ) { next if ( /^\s*\#/ ); # skip comment lines next if ( ! /\S/ ); # skip blank lines chomp; @fields = split /\t/; $spid = $fields[2]; $sp_ids{$spid} = $spid; $fields[0] =~ /^(\S{4}):(\S+)/; $pdbid = $1; $chain = $2; push @{$chains{$pdbid}{$chain}{man}}, $spid; # no need to call align() since these were all aligned from other sources already } # ----------------------------------------- # Entering manual Blast data into main hash # ----------------------------------------- print STDERR "Processing data from $files[4]\n"; while ( ) { next if ( /^\s*\#/ ); # skip comment lines next if ( ! /\S/ ); # skip blank lines chomp; @fields = split /\t/; $spid = $fields[2]; $sp_ids{$spid} = $spid; $fields[0] =~ /^(\S{4}):(\S+)/; $pdbid = $1; $chain = $2; push @{$chains{$pdbid}{$chain}{blast}}, $spid; align(); # careful, sub align() uses (and changes) $chain } print STDERR "Processing data from $files[5]\n"; while ( ) { next if ( /^\s*\#/ ); # skip comment lines next if ( ! /\S/ ); # skip blank lines chomp; @fields = split /\t/; $fields[2] =~ /^(\S{4}):(\S+)/; # this one's already in the hash $pdbid = $1; $chain = $2; if ( exists $chains{$pdbid}{$chain}{blast} ) { $spid = $chains{$pdbid}{$chain}{blast}[0]; $fields[0] =~ /^(\S{4}):(\S+)/; # this is the "new" one $chains{$1}{$2}{blast}[0] = $spid; $pdbid = $1; # now it's referring to the new one align(); } } # ---------------------------------------------- # Entering manual correction data into main hash # ---------------------------------------------- print STDERR "Processing data from $files[6]\n"; while ( ) { next if ( /^\s*\#/ ); # skip comment lines next if ( ! /\S/ ); # skip blank lines chomp; @fields = split /\t/; $fields[0] =~ /^(\S{4}):(\S+)/; $pdbid = $1; $chain = $2; if ( $fields[1] eq 'remove' ) { push @{$chains{$pdbid}{$chain}{rem}}, '--removed--'; } else { $spid = $fields[2]; $sp_ids{$spid} = $spid; push @{$chains{$pdbid}{$chain}{corr}}, $spid; align(); # careful, sub align() uses (and changes) $chain } } # --------------------------------------------- # Collecting names, synomyms, species, keywords # --------------------------------------------- while ( ) { next if ( /^\s*\#/ ); # skip comment lines next if ( ! /\S/ ); # skip blank lines chomp; @fields = split /\t/; $spid = $fields[0]; if ( exists $sp_ids{$spid} and ! exists $sp_names{$fields[0]} ) { $sp_names{$spid} = $fields[1]; foreach (2..$#fields) { push @{$sp_synonyms{$spid}}, $fields[$_]; } } else { # print STDERR "Ignored duplicate name entry for $fields[0] ($fields[1])\n"; # print STDERR "Kept previous name entry ($sp_names{$fields[0]})\n\n"; # can't do this with Trembl because ID and AC are always the same => 500,000 duplicate entries! } } while ( ) { next if ( /^\s*\#/ ); # skip comment lines next if ( ! /\S/ ); # skip blank lines chomp; @fields = split /\t/; $spid = $fields[0]; if ( exists $sp_ids{$spid} and ! exists $sp_species{$fields[0]} ) { foreach (1..$#fields) { push @{$sp_species{$spid}}, $fields[$_]; } } } while ( ) { next if ( /^\s*\#/ ); # skip comment lines next if ( ! /\S/ ); # skip blank lines chomp; @fields = split /\t/; $spid = $fields[0]; if ( exists $sp_ids{$spid} and ! exists $sp_keywords{$fields[0]} ) { foreach (1..$#fields) { push @{$sp_keywords{$spid}}, $fields[$_]; } } } # ========================================== # FINDING CONSENSUS OF DATA IN THE MAIN HASH # ========================================== print STDERR "Finding consensus\n"; # First: consensus finding purely on agreement between sources # (without using alignment scores) foreach $pdbid (keys %chains) { foreach $chain (keys %{$chains{$pdbid}} ) { next if $chain eq '???'; next if $chains{$pdbid}{$chain}{type} ne 'P'; if ( exists $chains{$pdbid}{$chain}{dbref} and exists $sp_names{$chains{$pdbid}{$chain}{dbref}[0]} ) { # valid dbref entry exists if ( exists $chains{$pdbid}{$chain}{sp} ) { # sp entry exists as well foreach ( @{$chains{$pdbid}{$chain}{sp}} ) { if ( $chains{$pdbid}{$chain}{dbref}[0] eq $_ ) { # match $chains{$pdbid}{$chain}{cons}[0] = $_; } } } elsif ( exists $chains{$pdbid}{'???'}{sp} ) { # unassigned sp entry exists as well foreach ( @{$chains{$pdbid}{'???'}{sp}} ) { if ( $chains{$pdbid}{$chain}{dbref}[0] eq $_ ) { # match $chains{$pdbid}{$chain}{cons}[0] = $_; } } } elsif ( exists $chains{$pdbid}{$chain}{mia} ) { # no sp, but mia foreach ( @{$chains{$pdbid}{$chain}{mia}} ) { if ( $chains{$pdbid}{$chain}{dbref}[0] eq $_ ) { # match $chains{$pdbid}{$chain}{cons}[0] = $_; } } } elsif ( exists $chains{$pdbid}{'???'}{mia} ) { # no sp, but unassigned mia foreach ( @{$chains{$pdbid}{'???'}{mia}} ) { if ( $chains{$pdbid}{$chain}{dbref}[0] eq $_ ) { # match $chains{$pdbid}{$chain}{cons}[0] = $_; } } } else { # only dbref exists $chains{$pdbid}{$chain}{cons}[0] = $chains{$pdbid}{$chain}{dbref}[0]; # dbref IS consensus! :-) } } else { # no valid dbref entry exists if ( exists $chains{$pdbid}{$chain}{sp} ) { # sp entry exists, chain assigned if ( $#{$chains{$pdbid}{$chain}{sp}} == 0 ) { # it's only one entry $chains{$pdbid}{$chain}{cons}[0] = $chains{$pdbid}{$chain}{sp}[0] if exists $sp_names{$chains{$pdbid}{$chain}{sp}[0]}; # name check may be unnecessary here # the single sp entry is consensus } } elsif ( exists $chains{$pdbid}{$chain}{mia} ) { # mia entry exists, chain assigned if ( $#{$chains{$pdbid}{$chain}{mia}} == 0 ) { # it's only one entry $chains{$pdbid}{$chain}{cons}[0] = $chains{$pdbid}{$chain}{mia}[0] if exists $sp_names{$chains{$pdbid}{$chain}{mia}[0]}; # name check is necessary here # the single mia entry is consensus } } } # other cases: if ( exists $chains{$pdbid}{$chain}{dbref} and exists $sp_names{$chains{$pdbid}{$chain}{dbref}[0]} and ! exists $chains{$pdbid}{$chain}{sp} and ! exists $chains{$pdbid}{'???'}{sp} and ! exists $chains{$pdbid}{$chain}{mia} and exists $chains{$pdbid}{'???'}{mia} and ! exists $chains{$pdbid}{$chain}{cons} # the unassigned mia entry alone prevented consensus ) { $chains{$pdbid}{$chain}{cons}[0] = $chains{$pdbid}{$chain}{dbref}[0]; } } } # Now: If the above didn't produce a consensus based on agreement between sources alone, # then let's consider throwing in some information about the alignment scores: foreach $pdbid (keys %chains) { foreach $chain (keys %{$chains{$pdbid}} ) { next if $chain eq '???'; next if $chains{$pdbid}{$chain}{type} ne 'P'; next if exists $chains{$pdbid}{$chain}{cons}; # consider only remaining non-consensus chains foreach $src ( ('dbref','sp','mia') ) { if ( exists $chains{$pdbid}{$chain}{$src} ) { foreach $spid ( @{$chains{$pdbid}{$chain}{$src}} ) { if ( exists $scores{$pdbid}{$chain}{$spid} and $scores{$pdbid}{$chain}{$spid} > $BAD_SCORE_FOR_CONS ) { push @{$chains{$pdbid}{$chain}{notbad}}, $spid; } } } } if ( $#{$chains{$pdbid}{$chain}{notbad}} == 0 ) { # only one not bad alignment $spid = $chains{$pdbid}{$chain}{notbad}[0]; if ( $scores{$pdbid}{$chain}{$spid} > $GOOD_SCORE_FOR_CONS ) { # and it's actually really good $chains{$pdbid}{$chain}{cons}[0] = $spid if exists $sp_names{$spid}; } } if ( $#{$chains{$pdbid}{$chain}{notbad}} == 1 ) { # only two alignments that are not outright bad if ( $chains{$pdbid}{$chain}{notbad}[0] eq $chains{$pdbid}{$chain}{notbad}[1] ) { # and they agree $spid = $chains{$pdbid}{$chain}{notbad}[0]; $chains{$pdbid}{$chain}{cons}[0] = $spid if exists $sp_names{$spid}; } } } } # Note on the following categories: # --------------------------------- # # Both "man" and "blast" were manually assigned for the chains where # all other consensus finding efforts were fruitless at the time. # However, they are only "adopted" if no consensus is already present. # Later, if additional information (from e.g. SwissProt or MIA) became available, # these "manual" entries would NOT overwrite any consensus that # might have been accomplished earlier on. # # In contrast, the "corr" category is manually derived information # which WILL deliberately overwrite other information to allow # the manual correction of knowingly wrong consensus data. # Very special care obviously has to be taken with this category. # enter the manually derived consensus information foreach $pdbid (keys %chains) { foreach $chain (keys %{$chains{$pdbid}} ) { if ( exists $chains{$pdbid}{$chain}{man} and ! exists $chains{$pdbid}{$chain}{cons} ) { # will NOT overwrite $chains{$pdbid}{$chain}{cons}[0] = $chains{$pdbid}{$chain}{man}[0]; } } } # enter what was manually derived from Blast reports foreach $pdbid (keys %chains) { foreach $chain (keys %{$chains{$pdbid}} ) { if ( exists $chains{$pdbid}{$chain}{blast} and ! exists $chains{$pdbid}{$chain}{cons} ) { #will NOT overwrite $chains{$pdbid}{$chain}{cons}[0] = $chains{$pdbid}{$chain}{blast}[0]; } } } # enter manually corrected data foreach $pdbid (keys %chains) { foreach $chain (keys %{$chains{$pdbid}} ) { if ( exists $chains{$pdbid}{$chain}{corr} ) { $chains{$pdbid}{$chain}{cons}[0] = $chains{$pdbid}{$chain}{corr}[0]; # WILL overwrite! print STDERR "$pdbid:$chain overwritten with SPID ", $chains{$pdbid}{$chain}{corr}[0], " \n"; } } } # manually remove chains foreach $pdbid (keys %chains) { foreach $chain (keys %{$chains{$pdbid}} ) { if ( exists $chains{$pdbid}{$chain}{rem} ) { delete( $chains{$pdbid}{$chain}{cons} ); # WILL delete! print STDERR "$pdbid:$chain removed from consensus\n"; } } } # ---------------------------------------- # Write consensus data to main output file # ---------------------------------------- foreach $pdbid (sort keys %chains) { foreach $chain (sort keys %{$chains{$pdbid}} ) { next if $chain eq '???'; if ( exists $chains{$pdbid}{$chain}{cons} ) { $spid = $chains{$pdbid}{$chain}{cons}[0]; # first line: id, names, etc. # --------------------------- print OUT "$pdbid:$chain\tID\t$spid\t"; if ( exists $scores{$pdbid}{$chain}{$spid} ) { print OUT sprintf("%.2f",$scores{$pdbid}{$chain}{$spid}),"\t"; $n_scores++; $average_score += $scores{$pdbid}{$chain}{$spid}; } else { print OUT "--no score--\t"; # this should never happen } if ( exists $sp_names{$spid} ) { print OUT "$sp_names{$spid}"; } else { print OUT "--no name--"; # this should never happen } if ( exists $sp_synonyms{$spid} ) { foreach ( @{$sp_synonyms{$spid}} ) { print OUT "\t$_"; } } print OUT "\n"; # second line: species info # ------------------------- print OUT "$pdbid:$chain\tSRC"; if ( exists $sp_species{$spid} ) { foreach ( @{$sp_species{$spid}} ) { print OUT "\t$_"; } } print OUT "\n"; # third line: keywords # -------------------- print OUT "$pdbid:$chain\tKW"; if ( exists $sp_keywords{$spid} ) { foreach ( @{$sp_keywords{$spid}} ) { print OUT "\t$_"; } } print OUT "\n"; } } } # ------------------------------------------- # Write comprehensive log file for all chains # ------------------------------------------- # The log file contains data for the following types of chains # # 1) protein chains with complete consensus # 2) protein chains with SwissProt info but no consensus # 3) SwissProt info, unresolved chain id # 4) protein chains without any SwissProt info # 5) protein chains without consensus # 6) non-protein chains # 7) all alignment scores (regardless of consensus or source) # # There are a few statistics at the end of the file # 1) protein chains with complete consensus # ----------------------------------------- print LOG "# 1) protein chains with complete consensus\n"; foreach $pdbid (sort keys %chains) { $n_pdbids_total++; foreach $chain (sort keys %{$chains{$pdbid}} ) { next if $chain eq '???'; $n_chains_total++; next if $chains{$pdbid}{$chain}{type} ne 'P'; $n_prot_chains++; if ( exists $chains{$pdbid}{$chain}{cons}) { # consensus $n_chains_w_cons++; foreach $src ( ('dbref','sp','mia','man','blast','corr') ) { if ( exists $chains{$pdbid}{$chain}{$src} ) { foreach $spid ( @{$chains{$pdbid}{$chain}{$src}} ) { print LOG "$pdbid:$chain\t$src\t$spid\t"; if ( exists $scores{$pdbid}{$chain}{$spid} ) { print LOG sprintf("%.2f",$scores{$pdbid}{$chain}{$spid}),"\t"; } else { print LOG "--no score--\t"; } if ( exists $sp_names{$spid} ) { print LOG "$sp_names{$spid}\n"; } else { print LOG "--no name--\n"; } } } } } } } # 2) protein chains with SwissProt info but no consensus # ------------------------------------------------------ print LOG "# 2) protein chains with SwissProt info but no consensus\n"; foreach $pdbid (sort keys %chains) { foreach $chain (sort keys %{$chains{$pdbid}} ) { next if $chain eq '???'; next if $chains{$pdbid}{$chain}{type} ne 'P'; if ( ! exists $chains{$pdbid}{$chain}{cons}) { # consensus $n_chains_wo_cons++; foreach $src ( ('dbref','sp','mia','rem') ) { if ( exists $chains{$pdbid}{$chain}{$src} ) { foreach $spid ( @{$chains{$pdbid}{$chain}{$src}} ) { print LOG "$pdbid:$chain\t$src\t$spid\t"; if ( exists $scores{$pdbid}{$chain}{$spid} ) { print LOG sprintf("%.2f",$scores{$pdbid}{$chain}{$spid}),"\t"; } else { print LOG "--no score--\t"; } if ( exists $sp_names{$spid} ) { print LOG "$sp_names{$spid}\n"; } else { print LOG "--no name--\n"; } } } } } } } # 3) SwissProt info, unresolved chain id # -------------------------------------- print LOG "# 3) SwissProt info, unresolved chain id\n"; foreach $pdbid (sort keys %chains) { foreach $chain (sort keys %{$chains{$pdbid}} ) { next if $chain ne '???'; foreach $src ( ('dbref','sp','mia') ) { if ( exists $chains{$pdbid}{$chain}{$src} ) { foreach $spid ( @{$chains{$pdbid}{$chain}{$src}} ) { print LOG "$pdbid:$chain\t$src\t$spid\t"; if ( exists $sp_names{$spid} ) { print LOG "$sp_names{$spid}\n"; } else { print LOG "--no name--\n"; } } } } } } # 4) protein chains without any SwissProt info # -------------------------------------------- print LOG "# 4) protein chains without any SwissProt info\n"; foreach $pdbid (sort keys %chains) { foreach $chain (sort keys %{$chains{$pdbid}} ) { next if $chain eq '???'; if ( $chains{$pdbid}{$chain}{type} eq 'P' # it's a protein and ! exists $chains{$pdbid}{$chain}{dbref} # no dbref sws info and ! exists $chains{$pdbid}{$chain}{sp} # no sp info (chain id assigned) and ! exists $chains{$pdbid}{'???'}{sp} # no sp info (chain id unassigned) and ! exists $chains{$pdbid}{$chain}{mia} # no mia info (chain id assigned) and ! exists $chains{$pdbid}{'???'}{mia} # no mia info (chain id unassigned) and ! exists $chains{$pdbid}{$chain}{blast} # nothing assigned from Blast reports ) { print LOG "$pdbid:$chain\n"; $n_prot_chains_wo_info++; } } } # 5) protein chains without consensus # ----------------------------------- print LOG "# 5) protein chains without consensus\n"; foreach $pdbid (sort keys %chains) { foreach $chain (sort keys %{$chains{$pdbid}} ) { next if $chain eq '???'; if ( $chains{$pdbid}{$chain}{type} eq 'P' # it's a protein and ! exists $chains{$pdbid}{$chain}{cons} # no dbref sws info ) { print LOG "$pdbid:$chain\n"; } } } # 6) non-protein chains # --------------------- print LOG "# 6) non-protein chains\n"; foreach $pdbid (sort keys %chains) { foreach $chain (sort keys %{$chains{$pdbid}} ) { next if $chain eq '???'; if ( $chains{$pdbid}{$chain}{type} ne 'P' ) { print LOG "$pdbid:$chain\t$chains{$pdbid}{$chain}{type}\n"; } } } # 7) all alignment scores (regardless of consensus or source) # ----------------------------------------------------------- print LOG "# 7) all alignment scores (regardless of consensus or source)\n"; foreach $pdbid (sort keys %scores) { foreach $chain (sort keys %{$scores{$pdbid}} ) { foreach $spid (sort keys %{$scores{$pdbid}{$chain}} ) { print LOG "$pdbid:$chain\t$spid\t", sprintf("%.2f",$scores{$pdbid}{$chain}{$spid}),"\n"; } } } $average_score /= $n_scores; print LOG "# ---------------------------------\n"; print LOG "# Statistics\n"; print LOG "# Total number of pdbids processed: $n_pdbids_total\n"; print LOG "# Total number of chains processed: $n_chains_total\n"; print LOG "# Number of protein chains: $n_prot_chains\n"; print LOG "# Protein chains with SwissProt consensus: $n_chains_w_cons\n"; print LOG "# Protein chains left without SwissProt consensus: $n_chains_wo_cons\n"; print LOG "# Protein chains without any SwissProt info: $n_prot_chains_wo_info\n"; print LOG "# New alignments (not previously calculated): $n_alignments\n"; print LOG "# Average alignment (in %) for consensus chains: ", sprintf("%.2f",$average_score),"\n"; close SP; close SPMIA; close SPDBREF; close NAMES; close SPMAN; close SPBLAST; close SPDUPL; close SPCORR; close SPECIES; close KEYWORDS; close OUT; close LOG; Storable::store \%scores, "pdb2sptr_alignment_scores.bin"; # storing the hash of alignment scores exit; # ----------- # SUBROUTINES # ----------- #----------------------- sub all_chains_identical { #----------------------- my @seqs = (); my $identical = 1; # all sequences identical unless proven otherwise below foreach ( keys %{$chains{$pdbid}} ) { push @seqs, $seq{$pdbid.":".$_}; # retrieve sequence, add to array } foreach ( 1..$#seqs ) { $identical = 0 if $seqs[$_] ne $seqs[0]; } return $identical; } #------------------------- sub only_one_protein_chain { #------------------------- my $n_p_chains = 0; my $only_protein_chain = ''; foreach ( keys %{$chains{$pdbid}} ) { if ( $chain_type{$pdbid.":".$_} eq 'P' ) { $only_protein_chain = $_; $n_p_chains++; } } $chain = $only_protein_chain if $n_p_chains == 1; return ( $n_p_chains == 1 )?1:0; } #-------- sub align { #-------- my $alignment; my $already_aligned = 0; @chains_w_pdb2sp_ali = (); if ( exists $scores{$pdbid} ) { # has this pdbid been aligned with anything before? foreach $chain ( keys %{$scores{$pdbid}} ) { if ( exists $scores{$pdbid}{$chain}{$spid} ) { # yes, it's been aligned with this spid $already_aligned = 1; } } } if ( ! $already_aligned ) { # not already in hash $spseqobj = $spinx->fetch($spid); # SP sequence object return if ! $spseqobj; # if couldn't fetch it # I changed Bio::Index::AbstractSeq::fetch # so it wouldn't just die as before # error message written to STDERR from fetch $spseq = $spseqobj->seq(); # SP sequence foreach $chain ( keys %{$chains{$pdbid}} ) { next if $chain eq '???'; # not sure whether this can still happen # needed it during earlier code development $pdbseq = $seq{$pdbid.":".$chain}; # pdb sequence $pdbseqobj = Bio::Seq->new(-seq=>$pdbseq); # pdb sequence object next if length($pdbseq) < 3; # I've seen "AA" cause a problem # such short chains are pretty meaningless anyhow next if length($spseq) < 3; # just in case, but I've never seen it next if $chain_type{"$pdbid:$chain"} ne 'P'; # aha, now here's a problem! $alignment = $fac->pairwise_alignment($pdbseqobj, $spseqobj); # pairwise alignment object $n_alignments++; $scores{$pdbid}{$chain}{$spid} = $alignment->percentage_identity(); # add to hash of alignment scores } } foreach $chain ( keys %{$scores{$pdbid}} ) { if ( exists $scores{$pdbid}{$chain}{$spid} and $scores{$pdbid}{$chain}{$spid} > $MIN_SCORE_TO_ASSIGN_CHAIN ) { push @chains_w_pdb2sp_ali, $chain; } } }