webutils.python

How To/python/MyMiscUtils/
# -*- coding: utf-8 -*-
 
import httplib, urlparse, urllib, os
import shutil
import tempfile
import random
 
# Presumably an old verbosity switch (TODO confirm); nothing in this module
# reads it any more.
WU_SAYLOUD = True # deprecated - not used
 
 
class WUContentLengthError(Exception):
    """Raised when a resource is smaller than the requested minimum size."""


class WUURLError(Exception):
    """Raised when a URL answers with a non-200 status and no redirect."""


class WURedirectLimitError(Exception):
    """Raised when the allowed number of redirects has been used up."""
 
 
def createtempname(filename=''):
    """
    str createtempname([str filename])
    Builds a path inside the system temporary directory. The file name is
    'tmp_webutils_' followed by 10 characters sampled (without replacement)
    from a lowercase-letter/digit pool, followed by filename.
    Only the name is produced; no file is created.
    """
    tempDir = tempfile.gettempdir()
    pool = 'qwertyuiopasdfghjklzxcvbnm11234567890'
    token = ''.join(random.sample(pool, 10))
    baseName = 'tmp_webutils_' + token + filename
    return os.path.join(tempDir, baseName)
 
def getContentType(url, allowed=None):
    """
    mixed getContentType(str url [, list allowed])
    Sends a HEAD request for url and looks at the Content-Type header
    (parameters such as '; charset=...' are stripped off).

    Without allowed (or with an empty one) the mime type is returned, or
    None if the server sent no Content-Type header.
    With a non-empty allowed the result is True if the mime type is in it
    and False otherwise (including when the header is missing).
    """
    p = urlparse.urlparse(url)

    # An empty path is not a valid request target; also keep the query string
    # so the HEAD request hits the same resource the URL names.
    path = p[2] or '/'
    if p[4]:
        path += '?%s' % p[4]

    h = httplib.HTTPConnection(p[1])
    try:
        h.request('HEAD', path)
        header = h.getresponse().getheader('Content-Type')
    finally:
        # Always release the socket, even if the request fails.
        h.close()

    ct = None
    if header is not None:
        ct = header.split(';')[0].strip()

    if allowed:
        return ct in allowed
    return ct
### /getContentType
 
def getFile(url, saveAs, minSize=0, data=None):
    """
    getFile(str URL, str saveAs[, int minSize[, mixed data]]) -> boolean
    Downloads the given URL and saves it as saveAs.

    minSize is optional. You can set a minimum file size if the server doesn't
    respond with the standard 404 (not found) code but with 200 (ok) and some
    kind of message like "image not found" or something like that.

    data is optional POST data: either an already encoded string or anything
    urllib.urlencode accepts.

    Returns True when the file was saved, False when the download failed with
    an IOError (a message is printed in that case).
    Raises WUContentLengthError when the announced Content-Length or the
    downloaded file is smaller than minSize.
    """
    minSize = int(minSize or 0)

    p = urlparse.urlparse(url)
    # An empty path is not a valid request target; keep the query string too.
    headPath = p[2] or '/'
    if p[4]:
        headPath += '?%s' % p[4]

    h = httplib.HTTPConnection(p[1])
    try:
        h.request('HEAD', headPath)
        resp = h.getresponse()
        status = resp.status
        announced = resp.getheader('content-length')
    finally:
        h.close()

    # Cheap pre-check: skip the download if the server already says the body
    # is too small. Servers may omit Content-Length; then only the check on
    # the downloaded file applies.
    if status == 200 and minSize > 0 and announced is not None:
        if int(announced) < minSize:
            raise WUContentLengthError('Content-length header\'s value ' \
            'is %s (%s required)' % (announced, minSize))

    tmpName = createtempname(os.path.splitext(saveAs)[1])
    try:
        try:
            opener = urllib.FancyURLopener()
            if data is None:
                opener.retrieve(url, tmpName)
            else:
                if not isinstance(data, str):
                    data = urllib.urlencode(data)
                opener.retrieve(url, tmpName, data=data)
        except IOError:
            print('can\'t get %s' % url)
            return False

        # Measure before the temp file is removed so the error can report it.
        size = os.path.getsize(tmpName)
        if size < minSize:
            raise WUContentLengthError('Downloaded file\'s size ' \
            'is %s (%s required)' % (size, minSize))

        shutil.copy(tmpName, saveAs)
        return True
    finally:
        # The temp file may not exist if the download failed early.
        if os.path.exists(tmpName):
            os.remove(tmpName)
### /getFile
 
def resourceExists(url, allowRedirects=False, limit=10):
    """
    resourceExists(str url[, bool allowRedirects[, int limit]]) -> boolean
    Checks if resource exists by sending HEAD requests.

    Returns True as soon as a request answers with status 200.
    Returns False on any other status, unless allowRedirects is set and the
    response carries a Location header; then that location is tried next.
    Raises WURedirectLimitError after limit requests without a final answer.
    """
    remaining = limit
    while remaining > 0:
        p = urlparse.urlparse(url)

        getPath = p[2] or '/'
        if p[4]: # GET query
            getPath += '?%s' % p[4]

        h = httplib.HTTPConnection(p[1])
        try:
            h.request('HEAD', getPath)
            resp = h.getresponse()
            status = resp.status
            location = resp.getheader('Location')
        finally:
            # Release the socket before the next hop.
            h.close()

        if status == 200:
            return True
        if not allowRedirects or location is None:
            return False

        # Location may be relative; resolve it against the current URL.
        url = urlparse.urljoin(url, location)
        remaining -= 1

    # Report the configured limit, not the exhausted counter.
    raise WURedirectLimitError('Redirect Limit (%s) exceeded.' % limit)
### /resourceExists
 
def getStatus(url):
    """
    getStatus(str url) -> int
    Sends a HEAD request for url (path plus query string, '/' if the path
    is empty) and returns the HTTP status code of the response.
    Redirects are not followed.
    """
    p = urlparse.urlparse(url)

    getPath = p[2] or '/'
    if p[4]: # GET query
        getPath += '?%s' % p[4]

    h = httplib.HTTPConnection(p[1])
    try:
        h.request('HEAD', getPath)
        return h.getresponse().status
    finally:
        # Always release the socket, even if the request fails.
        h.close()
### /getStatus
 
def getTrueUrl(url, limit=10):
    """
    getTrueUrl(str url[, int limit]) -> str
    Gets true url (after all redirects).

    Sends HEAD requests, following Location headers, until a response with
    status 200 arrives; the URL that produced it is returned.
    Raises WUURLError when a non-200 response has no Location header.
    Raises WURedirectLimitError after limit requests without reaching a 200.
    """
    remaining = limit
    while remaining > 0:
        p = urlparse.urlparse(url)

        getPath = p[2] or '/'
        if p[4]: # GET query
            getPath += '?%s' % p[4]

        h = httplib.HTTPConnection(p[1])
        try:
            h.request('HEAD', getPath)
            resp = h.getresponse()
            status = resp.status
            location = resp.getheader('Location')
        finally:
            # Release the socket before the next hop.
            h.close()

        if status == 200:
            return url
        if location is None:
            raise WUURLError('%s status is <%s>' % (url, status))

        # Location may be relative; resolve it against the current URL.
        url = urlparse.urljoin(url, location)
        remaining -= 1

    # Report the configured limit, not the exhausted counter.
    raise WURedirectLimitError('Redirect Limit (%s) exceeded.' % limit)
### /getTrueUrl
 
class FileDownloader:
    """
    FileDownloader

    fd = FileDownloader()
    fd.download(url, saveAs, minSize = 0) -> boolean

    After download() the instance carries:
      responseCode   - status of the HEAD request sent before downloading
                       (stored only; onFinish does not look at it)
      validSize      - result of fileIsTooSmall() if it was called, else True
      validCompleted - result of downloadCompleted() if it was called,
                       else False

    Methods that you can override:

    FileDownloader.fileIsTooSmall(url, fileName) -> boolean
    Is called if downloaded file is smaller than minSize.
    Should return False.

    FileDownloader.downloadCompleted(url, fileName) -> boolean
    Is called when download is complete and file
    size is not smaller than minSize.
    Should return True.

    FileDownloader.onFinish() -> boolean
    Is called at the end of download(); its result is what download() returns.
    Should return False if self.validSize or self.validCompleted is false,
    or True otherwise.
    """
    def __init__(self):
        pass

    def download(self, url, saveAs, minSize = 0):
        """Download url to saveAs and return the result of onFinish()."""
        self.validSize = True
        self.validCompleted = False

        p = urlparse.urlparse(url)
        h = httplib.HTTPConnection(p[1])
        try:
            # An empty path is not a valid request target.
            h.request('HEAD', p[2] or '/')
            self.responseCode = h.getresponse().status
        finally:
            h.close()

        self.__getFile(url, saveAs, minSize)

        return self.onFinish()

    def __getFile(self, url, saveAs, minSize):
        """
        Fetch url into a temp file and copy it to saveAs if it is big enough.
        Returns True when saveAs was written, False otherwise. The temp file
        is removed on every path.
        """
        tmpName = createtempname(os.path.splitext(saveAs)[1])
        try:
            try:
                urllib.FancyURLopener().retrieve(url, tmpName)
            except IOError:
                print('cant get %s' % url)
                return False

            bigEnough = os.path.getsize(tmpName) >= minSize
            if bigEnough:
                shutil.copy(tmpName, saveAs)
        finally:
            # The temp file may not exist if the download failed early.
            if os.path.exists(tmpName):
                os.remove(tmpName)

        # Hooks run after cleanup, as before.
        if not bigEnough:
            self.validSize = self.fileIsTooSmall(url, saveAs)
            return False

        self.validCompleted = self.downloadCompleted(url, saveAs)
        return True

    # hooks
    def fileIsTooSmall(self, url, fileName):
        """Hook: the downloaded file was smaller than minSize."""
        print('TOO SMALL!')
        return False

    def downloadCompleted(self, url, fileName):
        """Hook: the file was downloaded and saved."""
        print('DOWNLOADED!')
        return True

    def onFinish(self):
        """Hook: combine validSize and validCompleted into the final result."""
        print('Exit')
        return bool(self.validSize and self.validCompleted)
### /FileDownloader