#! /usr/local/bin/perl # A simple Perl script for downloading the files # from any given Protein Data Bank update date. # # Type perldoc getPdbUpdate.pl for usage information, # or view the getPdbUpdate.html file. $VERSION = '1.0'; ####################### # Documentation (pod) # ####################### =head1 NAME getPdbUpdate.pl =head1 SYNOPSIS A simple Perl utility for downloading the files from any given Protein Data Bank update date. getPdbUpdate.pl dates getPdbUpdate.pl latest getPdbUpdate.pl 20020708 =head1 DESCRIPTION =head2 Overview getPdbUpdate.pl is a simple Perl program for downloading all files from the Protein Data Bank (PDB) associated with a particular update date, including coordinate files, structure factors, and nmr restraints. =head2 Requirements getPdbUpdate.pl requires either one of the following common download utilities. On most systems where Perl is installed, one or both of these utilities will already be present. 1) LWP::Simple, a simple Perl module for downloading files from the World Wide Web. LWP::Simple is part of the libwww-perl module, which is available from CPAN. Please see the link in the L section. 2) wget, a common Unix utility for downloading files from the World Wide Web. wget is also available for most Windows operating systems. Please see the link in the L section. =head2 Usage The program can be run in three different modes as follows: 1) Get a list of valid update dates: getPdbUpdate.pl dates In this mode, the program will only retrieve and print a list of all valid update dates. 2) Get the files from the latest update: getPdbUpdate.pl latest In this mode, the program will retrieve all files associated with the latest update. The files will be saved in some or all of the following directories underneath your current working directory: /added /modified /obsolete /models_added /models_modified /models_obsolete 3) Get the files from any given update: getPdbUpdate.pl e.g.: getPdbUpdate.pl 20020708 In this mode, the program will retrieve all files associated with the specified update date. Again, the files will be saved in the same directory structure, e.g.: 20020708/added 20020708/modified 20020708/obsolete 20020708/models_added 20020708/models_obsolete =head2 Notes 1) Please note that this program will write some or all of the following additional files relative to your current working directory: ls-lR wgettest.temp /added.pdb /added.sf /added.nmr /modified.pdb /modified.sf /modified.nmr /obsolete.pdb /obsolete.sf /obsolete.nmr /models_added.pdb /models_modified.pdb /models_obsolete.pdb If any of these files are present prior to running this program, they will be overwritten! A safe option would be to create a new directory first, and then run getPdbUpdate.pl, e.g.: mkdir newpdbfiles cd newpdbfiles getPdbUpdate.pl latest 2) If an added file (e.g. pdb1f5d.ent.Z from added.pdb) was obsoleted after the specified update date (20001212), then the program will issue a warning, and save the file in the obsolete directory (20001212/obsolete) on local disk instead of the added directory. 3) As of 1 July 2002, all theoretical models were moved to a dedicated area of the PDB FTP archive, and effectively all access to them was removed from the PDB Web interface. If a file (e.g. pdb1pln.ent.Z) represents a theoretical model, the program will issue a warning, and save the file in one of the models directories on local disk. =head1 VERSION This documentation refers to version 1.0 of getPdbUpdate.pl. I Version Date Comments 1.0 2002-07-16 First release =head1 AUTHOR Wolfgang Bluhm ( mail@wbluhm.com ) for the Protein Data Bank ( info@rcsb.org ) =head1 BUGS 1) Not really a bug, but if your perl locations happens to be different from /usr/local/bin/perl, simple run the program as perl getPdbUpdate latest 2) Be aware that neither LWP::Simple::getstore nor wget preserves the original time stamps of the files being downloaded. While this is probably of little consequence for your personal use, you should not serve files downloaded this way to the public through any kind of mirror site. =head1 SEE ALSO http://www.rcsb.org/pdb/ -- Protein Data Bank (PDB) home page ftp://ftp.rcsb.org/pub/pdb -- PDB FTP site http://www.rcsb.org/pdb/cgi/resultBrowser.cgi?Date::update=1 -- Last PDB Update ftp://ftp.rcsb.org/pub/pdb/software -- download page for this script and documentation http://www.cpan.org/modules/by-module/LWP/ -- libwww-perl download page http://www.gnu.org/software/wget/wget.html -- wget home page =head1 COPYRIGHT Copyright 2002 The Regents of the University of California All Rights Reserved Permission to use, copy, modify and distribute any part of this PDB software for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Those desiring to incorporate this PDB Software into commercial products or use for commercial purposes should contact the Technology Transfer Office, University of California, San Diego, 9500 Gilman Drive, La Jolla, CA 92093-0910, Ph: (858) 534-5815, FAX: (858) 534-7345. In no event shall the University of California be liable to any party for direct, indirect, special, incidental, or consequential damages, including lost profits, arising out of the use of this PDB software, even if the University of California has been advised of the possibility of such damage. The PDB software provided herein is on an "as is" basis, and the University of California has no obligation to provide maintenance, support, updates, enhancements, or modifications. The University of California makes no representations and extends no warranties of any kind, either implied or express, including, but not limited to, the implied warranties of merchantability or fitness for a particular purpose, or that the use of the pdb software will not infringe any patent, trademark or other rights. =cut ################ # main program # ################ # The URL of the Protein Data Bank FTP archive $pdbFtpUrl = "ftp://ftp.rcsb.org/pub/pdb/"; # Print usage info and quit if no command line argument was given if ( ! $ARGV[0] ) { &usage; exit(0); } # Read command line argument $updateDate = $ARGV[0]; # Print usage info and quit if command line argument was not # 'dates', 'latest', or an 8-digit date if ( $updateDate !~ /^\d{8}$/ and $updateDate !~ /^latest$/i and $updateDate !~ /^dates$/i ) { &usage; exit(0); } # Test whether we have LWP::Simple eval{use LWP::Simple;}; if ( ! $@) { # LWP::Simple loaded successfully print STDERR "Using LWP::Simple::getstore for file downloads\n"; $haveLWP = 1; } else { # If we don't have LWP::Simple, test whether we have wget eval{ system("wget -q -O wgettest.temp http://www.rcsb.org/pdb/index.html"); }; if ( ! $@ and -e "wgettest.temp" ) { # We have a working wget print STDERR "Using wget for file downloads\n"; $haveLWP = 0; } else { # We have neither LWP::Simple nor wget print STDERR "This program requires either LWP::Simple or wget\n"; exit(0); } } # Get the ls-lR file (list of FTP archive content) &get_lslR; # Parse the ls-lF file &parse_lslR; # Choose between the three valid program modes if ( $updateDate =~ /^latest$/i ) { $updateDate = $latest; print STDERR "Latest update date is $latest\n"; } elsif ( $updateDate =~ /^dates$/i ) { &printUpdateDates; exit(0); } elsif ( ! exists $dates{$updateDate} ) { print STDERR "$updateDate is not a valid update date\n"; &printUpdateDates; exit(0); } # Get the PDB files &getFiles; exit(0); ############### # subroutines # ############### # To sort numerically, instead of alphabetically sub numerically { $a <=> $b } # Get the ls-lR file (list of FTP archive content) # and save it to local disk sub get_lslR { print STDERR "Getting ls-lR file (content list of FTP archive)\n"; $lslR_Url = $pdbFtpUrl . "ls-lR"; eval{ &download($lslR_Url, "ls-lR"); }; if ( $@ or ! -e "ls-lR" ) { print STDERR "Could not download or save ls-lR file\n"; print STDERR "Trying one more time: "; eval{ &download($lslR_Url, "ls-lR"); }; if ( $@ or ! -e "$saveAs" ) { print STDERR "Sorry, still could not get it. Please try again later.\n"; exit(0); } else { print STDERR "Ok, got it this time!\n"; } } } # Parse ls-lR file # Survey content of the following FTP directories: # data/status/ # structures/{divided|obsolete}/{pdb|structure_factors|nmr_restraints} # structures/models/{current|obsolete}/pdb # Determine all update dates and the latest update date # sub parse_lslR { print STDERR "Parsing ls-lR file\n"; open (IN, ") { $line = $_; chomp $line; # update dates if ( $line =~ /data\/status\/(\d{8})/ ) { $date = $1; $latest = $date if ( $date > $latest ); $dates{$date} = 1; } # contents of each update elsif ( $date and $line =~ /^-r.*\s(\S+)$/ ) { $status{$date}{$1} = 1; } # experimental structures elsif ( $line =~ /structures\/(divided|obsolete)\/(pdb|structure_factors|nmr_restraints)\/(..):/ ) { $level1 = $1; $level2 = $2; $level3 = $3; } # experimental structures elsif ( $level1 and $line =~ /^-r.*\s(\S+)$/ ) { $ftpContent{$level1}{$level2}{$level3}{$1} = 1; } # theoretical models elsif ( $line =~ /structures\/models\/(current|obsolete)\/(pdb)\/(..):/ ) { $mlevel1 = $1; $mlevel2 = $2; $mlevel3 = $3; } # theoretical models elsif ( $mlevel1 and $line =~ /^-r.*\s(\S+)$/ ) { $modelContent{$mlevel1}{$mlevel2}{$mlevel3}{$1} = 1; } # reset elsif ( $line !~ /\S/ ) { $level1 = $level2 = $level3 = 0; $mlevel1 = $mlevel2 = $mlevel3 = 0; $date = 0; } } close IN; } # Print all update dates sub printUpdateDates { print STDERR "Valid update dates are:\n"; foreach ( reverse sort numerically keys %dates ) { $n++; print STDOUT "$_"; $delimeter = ( $n%8 ? " " : "\n" ); print STDOUT $delimeter; } } # Get the PDB file lists # (added.pdb etc.) sub getFiles { # Create the update date directory if necessary if ( ! -e "$updateDate" ) { eval{ mkdir "$updateDate", 0777; }; if ( $@ ) { print STDERR "Could not make directory $updateDate\n"; return; } } # The following are the files in each data/status/ directory # which in turn list all the PDB files associated with that update @fileLists = ( "added.pdb", "added.sf", "added.nmr", "modified.pdb", "modified.sf", "modified.nmr", "obsolete.pdb", "obsolete.sf", "obsolete.nmr", "models_added.pdb", "models_modified.pdb", "models_obsolete.pdb" ); print STDERR "Starting file downloads\n"; foreach $fileList ( @fileLists ) { # Was the fileList (e.g. models_added.pdb) included # in the status/ directory for the update date? if ( exists $status{$updateDate}{$fileList} ) { print STDERR "Getting $fileList\n"; } else { print STDERR "No $fileList file for update of $updateDate\n"; next; } # Get the file list from FTP and save to local disk $fileListUrl = $pdbFtpUrl . "data/status/$updateDate/$fileList"; $saveAs = "$updateDate" . "/" . "$fileList"; eval{ &download($fileListUrl, $saveAs); }; # Provided the file list was successfully downloaded and saved, # get each file listed and save to local disk if ( $@ or ! -e $saveAs ) { print STDERR "Could not download or save file list $updateDate/$fileList\n"; print STDERR "Trying one more time: "; eval{ &download($fileListUrl, $saveAs); }; if ( $@ or ! -e "$saveAs" ) { print STDERR "Sorry, still could not get it. "; print STDERR "The download will be missing the files from this list.\n"; } else { print STDERR "Ok, got it this time!\n"; } } else { open (IN, "$saveAs"); while () { chomp; if ( /^-r.*\s(\S+)$/ ) { $fileName = $1; &getFile($fileList,$fileName); } } close IN; } } } # Attempt to get an individual PDB file # (coordinates, structure factors, or nmr restraints) sub getFile($$) { my $fileList = shift; my $fileName = shift; # Some files are listed without the '.Z' extension # (e.g. in added.pdb), even though all files in the # FTP archive have the '.Z' extention if ( $fileName !~ /\.Z$/ ) { $fileName .= ".Z"; } # Determine the type of file (coordinates, structure factors, nmr restraints) # and the hash directory if ( $fileName =~ /^pdb.(..).\.ent/ ) { # coordinate file $fileType = "pdb"; $hashDir = $1; } elsif ( $fileName =~ /^r.(..).sf\.ent/ ) { # structure factor file $fileType = "structure_factors"; $hashDir = $1; } elsif ( $fileName =~ /^.(..).\.mr/ ) { # nmr constraint file $fileType = "nmr_restraints"; $hashDir = $1; } else { print STDERR "Could not parse the type of file $fileName\n"; return; } # Set the directory under which the file will be saved on local disk $dirName = $updateDate; $fileList =~ /^(\S+)\./; $subDirName = $1; # Assemble FTP download URL # # Was the file listed under the corresponding divided (current) directory? if ( exists $ftpContent{'divided'}{$fileType}{$hashDir}{$fileName} ) { $fileUrl = $pdbFtpUrl . "data/structures/divided/$fileType/$hashDir/$fileName"; } # Was the file listed under the corresponding obsolete directory? elsif ( exists $ftpContent{'obsolete'}{$fileType}{$hashDir}{$fileName} ) { $fileUrl = $pdbFtpUrl . "data/structures/obsolete/$fileType/$hashDir/$fileName"; $subDirName = "obsolete"; if ( $fileList !~ /obsolete/ ) { print STDERR "WARNING: file $fileName from list $fileList"; print STDERR " has been obsoleted since $updateDate\n"; } } # Current model? elsif ( exists $modelContent{'current'}{$fileType}{$hashDir}{$fileName} ) { $fileUrl = $pdbFtpUrl . "data/structures/models/current/$fileType/$hashDir/$fileName"; $subDirName = "models_" . $subDirName unless ( $subDirName =~ /models/ ); if ( $fileList !~ /models/ ) { print STDERR "WARNING: file $fileName from list $fileList"; print STDERR " is a theoretical model\n"; } } # Obsolete model? elsif ( exists $modelContent{'obsolete'}{$fileType}{$hashDir}{$fileName} ) { $fileUrl = $pdbFtpUrl . "data/structures/models/obsolete/$fileType/$hashDir/$fileName"; $subDirName = "models_obsolete"; if ( $fileList !~ /models/ ) { print STDERR "WARNING: file $fileName from list $fileList"; print STDERR " is a theoretical model\n"; } if ( $fileList !~ /obsolete/ ) { print STDERR "WARNING: file $fileName from list $fileList"; print STDERR " has been obsoleted since $updateDate\n"; } } else { print STDERR "ERROR: Could not find file $fileName from list $fileList"; return; } # Set the name under which the file will be saved on local disk $saveAs = "$dirName/$subDirName/$fileName"; # Create sub directory (added, modified, obsolete) if necessary if ( ! -e "$DirName/$subDirName" ) { eval{ mkdir "$dirName/$subDirName", 0777; }; if ( $@ ) { print STDERR "Could not make directory $dirName/$subDirName\n"; return; } } # Get file from FTP and save to local disk print STDERR "$fileUrl -> $saveAs\n"; eval{ &download($fileUrl, $saveAs); }; if ( $@ or ! -e $saveAs ) { print STDERR "Could not download or save file $fileName from list $updateDate/$fileList\n"; print STDERR "Trying one more time: "; eval{ &download($fileUrl, $saveAs); }; if ( $@ or ! -e "$saveAs" ) { print STDERR "Sorry, still could not get it.\n"; print STDERR "The download will be missing this file.\n"; } else { print STDERR "Ok, got it this time!\n"; } } } # Download a file and save to local disk # Uses LWP::Simple::getstore if available, or wget otherwise. # Program would have exited earlier if neither had been available. sub download($) { my $url = shift; my $saveAs = shift; if ( $haveLWP ) { getstore($url, $saveAs); } else { system("wget -q -O $saveAs $url"); } } # Print usage information sub usage { print STDERR "Usage: getPdbUpdate.pl latest\n"; print STDERR " or: getPdbUpdate.pl dates\n"; print STDERR " or: getPdbUpdate.pl \n"; print STDERR " eg: getPdbUpdate.pl 20020603\n"; }