#!/usr/bin/env python
# -*- coding: ISO-8859-1 -*-
#
# Downloads an external package from a web directory listing, given either the
# exact package name or a prefix of it.
#
# TODO: Improve upon extension recognition by checking for mismatches in found targets
#       and specified local file.
#
# imports {{{
import os
import re
import sys
import urllib

# The script is written in the common subset of Python 2 and Python 3; only
# the module locations differ between the two.
try:  # Python 3
    from html.parser import HTMLParser
    from urllib.error import HTTPError
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from HTMLParser import HTMLParser
    from urllib import urlretrieve
    from urllib2 import HTTPError, urlopen
# }}}

# Usage banner {{{
_BORDER = "******************************************************************************************************************************"

_USAGE_LINES = (
    _BORDER,
    "* Invalid input! *",
    "* *",
    "* Try: 'DownloadExternalPackage.py url [localFile]' *",
    "* *",
    "* Where 'URL' is the URL with an explicit package name or the URL followed by the truncated package name. And 'localFile' is *",
    "* the file name (including extension) that you would like to save as. *",
    "* *",
    "* Examples: *",
    "* *",
    "* DownloadExternalPackage.py 'http://issm.jpl.nasa.gov/files/externalpackages/petsc-2.3.2-p3.tar.gz' 'petsc-2.3.2-p3.tar.gz' *",
    "* *",
    "* This is the old style and the safest way to download a package. *",
    "* *",
    "* DownloadExternalPackage.py 'http://issm.jpl.nasa.gov/files/externalpackages/libtool' 'libtool.tar.gz' *",
    "* *",
    "* This is the new style. For packages like 'Libtool', which we never expect to be using multiple versions, this will *",
    "* download the most recent version and save it as the generic 'libtool.tar.gz'. *",
    "* *",
    "* DownloadExternalPackage.py 'http://issm.jpl.nasa.gov/files/externalpackages/gsl-1.' 'gsl-1.15.tar.gz' *",
    "* *",
    "* This is the new style. This is a demonstration of how this script can be used to disambiguate a package name if there *",
    "* are more than once package matching 'gsl-'. *",
    "* *",
    "* DownloadExternalPackage.py 'http://issm.jpl.nasa.gov/files/externalpackages/libtool' *",
    "* *",
    "* This is the new style. This will download a package with 'libtool' as a prefix and save it as its canonical name. *",
    "* *",
    "* *",
    _BORDER,
)
# }}}

# Appended to the (escaped) search term to build the href matcher.
#
# NOTE: the trailing group is optional and the matcher is only anchored at the
# start of the href, so this expression does not actually reject any file
# extension; it is kept as-is so that the set of accepted links is unchanged.
_SUFFIX_PATTERN = r"[\w.-]*(\.tar\.gz|tar\.gz2|tgz|zip|exe)?"


class MyHTMLParser(HTMLParser):  # {{{
    """HTML parser that collects the ``href`` values matching a pattern.

    ``pattern`` is compiled once and applied with ``re.match`` (i.e. anchored
    at the start of the attribute value).  Every matching value is appended,
    in document order, to ``self.targets``.
    """

    def __init__(self, pattern):
        # Explicit base-class call: the Python 2 HTMLParser is an old-style
        # class, so super() is not usable there.
        HTMLParser.__init__(self)
        self.matcher = re.compile(pattern)
        self.targets = []

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` attributes of ``tag`` that match the pattern."""
        for name, value in attrs:
            # A valueless attribute (``<a href>``) is reported with value
            # None, which re.match would reject with a TypeError.
            if name == "href" and value is not None and self.matcher.match(value):
                self.targets.append(value)
# }}}


def _print_usage():
    """Print the 'invalid input' usage banner."""
    for line in _USAGE_LINES:
        print(line)


def _build_pattern(find):
    """Return the regular expression used to recognise links to ``find``.

    The search term is escaped so that characters such as '.' or '+' in a
    package name are matched literally instead of acting as regex operators.
    """
    return re.escape(find) + _SUFFIX_PATTERN


def _read_listing(url):
    """Return the text of the page at ``url``.

    An HTTP error status does not abort the script: a warning is printed and
    the body of the error page is returned instead, so the caller simply ends
    up with no matching links.  Other failures (e.g. an unreachable host)
    propagate to the caller.  The connection is always closed.
    """
    try:
        response = urlopen(url)
    except HTTPError as err:
        print("WARNING: server answered HTTP " + str(err.code) + " for " + url)
        response = err  # an HTTPError doubles as a readable response object
    try:
        body = response.read()
    finally:
        response.close()
    # Python 3 hands back bytes, but the HTML parser needs text.  ISO-8859-1
    # maps every byte to a character, so decoding cannot fail.
    if not isinstance(body, str):
        body = body.decode("ISO-8859-1")
    return body


def main(argv=None):  # {{{
    """Locate a package in a web directory listing and download it.

    Parameters:
        argv: argument list laid out like ``sys.argv`` (program name, URL,
              optional local file name).  Defaults to ``sys.argv``.

    Returns:
        1 when the arguments are invalid (the usage banner is printed),
        0 in every other case, including 'no match' and 'ambiguous match'.
    """
    if argv is None:
        argv = sys.argv

    # Separates the URL into a directory and the file or pattern based on the
    # last appearance of '/'.  An argument without any '/' cannot be split
    # that way and is treated as invalid input.
    if len(argv) < 2 or "/" not in argv[1]:
        _print_usage()
        return 1
    url, _, find = argv[1].rpartition("/")

    local_file = argv[2] if len(argv) > 2 else None
    if local_file is not None:
        print("Downloaded file will be saved as: " + local_file)
    else:
        print("Downloaded file will saved with the same file name.")

    print("Looking for: " + find)

    # WARNING: The external packages directory includes executable binaries with
    # '.exe' extensions. As such, '.exe' is an acceptable suffix, but this is
    # inherently dangerous since this script can be used to download from any
    # valid website. Furthermore, if an individual attempts a "man-in-the-middle"
    # attack, then the user would be capable of downloading executables from
    # an untrusted source.
    parser = MyHTMLParser(_build_pattern(find))
    parser.feed(_read_listing(url))
    targets = parser.targets

    if not targets:
        print("No matches found!")
        return 0

    # If a file pattern was used to describe the file that should be downloaded,
    # then there is the potential for multiple file matches. Currently, the script
    # will detect this ambiguity and print out all the matches, while informing
    # the user that he must refine his search.
    #
    # TODO: Prompt the user to select from a list his/her preferred target.
    if len(targets) > 1:
        print("Could not resolve your download due to the number of hits.")
        print("Refine your search.")
        for hit in targets:
            print(hit)
        return 0

    target = targets[0]
    print("Found: " + target)

    if local_file is None:
        # The link text comes from a remote page: keep only its last path
        # component so that it cannot point outside the current directory.
        destination = os.path.basename(target) or target
    else:
        destination = local_file

    if os.path.exists(destination):
        print("File " + destination + " already exists and will not be downloaded...")
        return 0

    if local_file is not None:
        if target == local_file:
            print("File found and destination match.")
        elif parser.matcher.match(local_file) is not None:
            print("File found matches destination pattern.")
        else:
            print("WARNING: the file found '" + target + "' does not match '" + local_file + "'")
            print("Ensure the downloaded version is suitable.")

    urlretrieve(url + "/" + target, destination)
    print("File saved as: " + destination)
    return 0
# End 'main' function. }}}


if __name__ == "__main__":
    sys.exit(main())