# Written by Cameron Dale
# see LICENSE.txt for license information
#
# $Id: HTTPCache.py 268 2007-08-18 23:45:45Z camrdale-guest $

"""Manage an HTTP download cache.

@type logger: C{logging.Logger}
@var logger: the logger to send all log messages to for this module
@type time_format: C{string}
@var time_format: the format to use for reading/writing HTTP server times
@type VERSION: C{string}
@var VERSION: the UserAgent identifier sent to all sites
@type alas: C{string}
@var alas: the message to send when the data is not found
@type TIMEOUT: C{float}
@var TIMEOUT: the number of seconds after which an idle connection is closed

"""

from httplib import HTTPConnection, BadStatusLine
from socket import gaierror
from threading import Thread
from DebTorrent.__init__ import product_name,version_short
from clock import clock
from os.path import join, split, getmtime, getsize, exists
from os import utime, makedirs, listdir
from time import strftime, strptime, gmtime
from calendar import timegm
import logging

# Module-wide logger; every log message from this file goes through it.
logger = logging.getLogger('DebTorrent.HTTPCache')

# strftime/strptime pattern for HTTP dates, without the trailing time zone.
time_format = '%a, %d %b %Y %H:%M:%S'

# Identifier sent in the User-Agent and Server headers.
VERSION = '/'.join((product_name, version_short))

# Body returned when the requested data is not available in the cache.
alas = ('your file may exist elsewhere in the universe\n'
        'but alas, not here\n')

# Seconds of inactivity after which an idle connection is closed.
TIMEOUT = 60.0

class CacheRequest:
    """One pending download, together with its eventual outcome.
    
    @type path: C{list} of C{string}
    @ivar path: the server name followed by the path components to fetch
    @type func: C{method}
    @ivar func: the callback to invoke once the download has finished
    @type response: (C{int}, C{string}, C{dictionary}, C{string})
    @ivar response: the HTTP status code, status message, headers, and
        downloaded data (C{None} until a result or an error is stored)
    
    """
    
    def __init__(self, path, func):
        """Remember what to download and who to notify.
        
        @type path: C{list} of C{string}
        @param path: the server name followed by the path components to fetch
        @type func: C{method}
        @param func: the callback to invoke once the download has finished
        
        """
        
        # Nothing has been received yet
        self.response = None
        self.func = func
        self.path = path
        
    def save_response(self, r):
        """Store the outcome of a successful exchange with the server.
        
        @type r: C{httplib.HTTPResponse}
        @param r: the response object returned by the server connection
        
        """
        
        status = r.status
        reason = r.reason
        headers = dict(r.getheaders())
        # Reading the body consumes the response
        body = r.read()
        self.response = (status, reason, headers, body)
        
    def error(self, error_msg):
        """Store a synthetic 502 response describing a failure.
        
        @type error_msg: C{string}
        @param error_msg: a description of what went wrong
        
        """
        
        message = 'error accessing http server: ' + error_msg
        self.response = (502, 'Bad Gateway', {}, message)

class CacheConnection:
    """Download files needed for the HTTP download cache from a single server.
    
    @type handler: L{HTTPCache}
    @ivar handler: the cache manager for the download
    @type server: C{string}
    @ivar server: the webserver address and port to connect to 
    @type request: L{CacheRequest}
    @ivar request: the request currently in progress
    @type request_queue: C{list} of L{CacheRequest}
    @ivar request_queue: the waiting requests
    @type connection: C{HTTPConnection}
    @ivar connection: the connection to the HTTP server
    @type url: C{string}
    @ivar url: the URL to request from the site
    @type headers: C{dictionary}
    @ivar headers: the HTTP headers to send in the request
    @type active: C{boolean}
    @ivar active: whether there is a download underway
    @type closed: C{boolean}
    @ivar closed: whether ther connection has been closed
    @type last_action: C{float}
    @ivar last_action: the last time an action occurred
    
    """
    
    def __init__(self, handler, server):
        """Initialize the instance.
        
        @type handler: L{HTTPCache}
        @param handler: the cache manager for the download
        @type server: C{string}
        @param server: the server name to send the requests to
        
        """
        
        self.handler = handler
        self.server = server
        self.request = None
        self.request_queue = []
        self.headers = {'User-Agent': VERSION}
        self.active = False
        self.closed = False
        self.last_action = clock()

        try:
            self.connection = HTTPConnection(self.server)
        except:
            logger.exception('cannot connect to http server: '+self.server)
            self.close()

    def queue(self, path, func):
        """Queue a download for later starting.
        
        @type path: C{list} of C{string}
        @param path: the server and path to download
        @type func: C{method}
        @param func: the method to call when the download completes
        @rtype: C{boolean}
        @return: whether the download was successfully queued
        
        """
        
        assert path[0] == self.server
        if self.closed:
            return False

        logger.debug('queueing request for '+'/'.join(path))
        self.request_queue.append(CacheRequest(path, func))
        self._run_queue()

        return True
        
    def _run_queue(self):
        """Start the next element in the queue downloading."""
        
        # Check if one is already running
        if self.active or self.closed:
            return
        
        # If the queue is empty, then we are done
        if not self.request_queue:
            self.handler.rawserver.add_task(self.auto_close, int(TIMEOUT)+1)
            return
        
        self.active = True
        self.last_action = clock()
        self.request = self.request_queue.pop(0)
        self.url = '/' + '/'.join(self.request.path[1:])
        logger.debug('starting thread to download '+self.url)
        rq = Thread(target = self._request, name = 'CacheRequest('+self.server+')')
        rq.setDaemon(False)
        rq.start()

    def _request(self):
        """Do the request."""
        import encodings.ascii
        import encodings.punycode
        import encodings.idna
        
        try:
            logger.debug('sending request GET '+self.url+', '+str(self.headers))
            self.connection.request('GET', self.url, None, self.headers)
            
            # Check for closed persistent connection due to server timeout
            try:
                r = self.connection.getresponse()
            except BadStatusLine:
                # Reopen the connection to get a new socket
                logger.debug('persistent connection closed, attempting to reopen')
                self.connection.close()
                self.connection.connect()
                logger.debug('sending request GET '+self.url+', '+str(self.headers))
                self.connection.request('GET',self.url, None, self.headers)
                r = self.connection.getresponse()
                
            logger.debug('got response '+str(r.status)+', '+r.reason+', '+str(r.getheaders()))
            self.request.save_response(r)
        except gaierror, e:
            logger.warning('could not contact http server '+self.server+': '+str(e))
            self.request.error('could not contact http server '+self.server+': '+str(e))
        except Exception, e:
            logger.exception('error accessing http server')
            self.request.error(str(e))
        self.last_action = clock()
        self.handler.rawserver.add_task(self.request_finished)

    def request_finished(self):
        """Process the completed request."""
        
        # Save the result
        request = self.request
        self.request = None
        
        # Start the next queued item running
        self.active = False
        self._run_queue()
        
        # Return the result
        self.handler.download_complete(request.path, request.func,
                                       request.response)
        
    def auto_close(self):
        """Close the connection if it has been idle."""
        if (not self.active and not self.closed and not self.request and 
            not self.request_queue and (clock() - self.last_action) >= TIMEOUT):
            self.close()
    
    def close(self):
        """Close the connection."""
        logger.info('Closing the connection to: '+self.server)
        self.closed = True
        self.connection.close()
        
        # Process the current request
        if self.request:
            if not self.request.response:
                self.request.error('connection closed prematurely')
            self.handler.download_complete(self.request.path,
                                           self.request.func,
                                           self.request.response)
            self.request = None
        
        # Process any waiting requests
        for request in self.request_queue:
            if not request.response:
                request.error('connection closed prematurely')
            self.handler.download_complete(request.path, request.func,
                                           request.response)
        del self.request_queue[:]
        
        # Remove the connection to the server
        self.handler.remove(self, self.server)


class HTTPCache:
    """Manage an HTTP download cache.
    
    @type rawserver: L{Debtorrent.RawServer.RawServer}
    @ivar rawserver: the server
    @type downloads: C{dictionary}
    @ivar downloads: the current downloads, keys are the server names, values
        are the L{CacheConnection} objects used to download from the server
    @type cachedir: C{string}
    @ivar cachedir: the directory to save cache files in
    
    """
    
    def __init__(self, rawserver, cachedir):
        """Initialize the instance.
        
        @type rawserver: L{Debtorrent.RawServer.RawServer}
        @param rawserver: the server
        @type cachedir: C{string}
        @param cachedir: the directory to save cache files in
        
        """
        
        self.rawserver = rawserver
        self.downloads = {}
        self.cachedir = cachedir

    def download_get(self, path, func):
        """Create a new download from a site.
        
        If the request cannot be queued, the callback is invoked right away
        with a 500 response.
        
        @type path: C{list} of C{string}
        @param path: the server and path to download
        @type func: C{method}
        @param func: the method to call with the data when the download is complete
        
        """
        
        server = path[0]
        if server not in self.downloads:
            logger.info('Opening a connection to server: '+server)
            self.downloads[server] = CacheConnection(self, server)

        connection = self.downloads[server]
        if not connection.queue(path, func):
            # The connection refuses new work, so forget it; otherwise
            # every later request for this server would fail as well.
            if self.downloads.get(server) is connection:
                del self.downloads[server]
            func(path, (500, 'Internal Server Error', 
                        {'Server': VERSION, 
                         'Content-Type': 'text/html; charset=iso-8859-1'},
                        'Server could not be contacted'))

    def remove(self, d, server):
        """Remove a completed download connection.
        
        Does nothing if the connection is no longer the one registered for
        the server (for example because it was already removed).
        
        @type d: L{CacheConnection}
        @param d: the server connection that is no longer needed
        @type server: C{string}
        @param server: the server the connection was to
        
        """
        
        if self.downloads.get(server) is d:
            del self.downloads[server]
        else:
            logger.debug('connection already removed for server: '+server)

    def download_complete(self, path, func, r):
        """Process the returned data from a request.
        
        Once a download has been completed, save the downloaded file in the
        file system. Then return the data to the callback function.
        
        Only responses with status 200 are written to the cache. The
        headers passed to the callback contain the Server header, the
        modification time of the cached file (if there is one), and the
        last-modified and content-type headers of the response (if present).
        
        @type path: C{list} of C{string}
        @param path: the server and path that was downloaded
        @type func: C{method}
        @param func: the method to call with the data
        @type r: (C{int}, C{string}, C{dictionary}, C{string})
        @param r: the HTTP status code, status message, headers, and downloaded data
        
        """
        
        logger.info('download completed for: http://'+'/'.join(path))

        filename = self.get_filename(path)
        headers = {'Server': VERSION}

        if r[0] == 200:
            # Create the directory for the new file
            new_dir = split(filename)[0]
            if new_dir != '' and not exists(new_dir):
                makedirs(new_dir)
            
            # Write the new file, closing it even if the write fails
            f = open(filename, 'wb')
            try:
                f.write(r[3])
            finally:
                f.close()
            
            # Set the modified time (on error use current time which should work)
            try:
                mtime = timegm(strptime(r[2]['last-modified'], time_format + ' %Z'))
                utime(filename, (mtime, mtime))
            except Exception:
                logger.exception('Failed to set the cache time for the file')

        # Use the headers we want
        if exists(filename):
            mtime_string = strftime(time_format + ' GMT', gmtime(getmtime(filename)))
            headers['last-modified'] = mtime_string
        
        # Headers from the response take precedence over the file's time
        for k, v in r[2].items():
            if k in ('last-modified', 'content-type'):
                headers[k] = v
        
        # Call the callback function
        func(path, (r[0], r[1], headers, r[3]))

    def cache_get(self, path, uptodate = False, if_modified_time = ''):
        """Get the file from the cache.
        
        Will respond with the following HTTP status codes:
            - 200: the file was found in the cache and is up to date
            - 304: the file is up to date, but is not needed
            - 404: the file was not found in the cache
            - 405: the file was found, but is stale, and needs to be refreshed
        
        When C{uptodate} is set and the server's last-modified time is
        missing or cannot be parsed, the cached file is reported as stale.
        An C{if_modified_time} that cannot be parsed is ignored.
        
        @type path: C{list} of C{string}
        @param path: the server and path to download
        @type uptodate: C{boolean}
        @param uptodate: whether to check the age of the file on the server to 
            see if the cached one is still current (optional, defaults to False)
        @type if_modified_time: C{string}
        @param if_modified_time: the if-modified-since header from the request
            (optional, defaults to not checking the if-modified-time)
        @rtype: (C{int}, C{string}, C{dictionary}, C{string})
        @return: the HTTP status code, status message, headers, and package data
        
        """
        
        filename = self.get_filename(path)
            
        # Check if the file isn't in the cache
        if not exists(filename):
            logger.info('cache miss: '+filename)
            return (404, 'Not Found', {'Server': VERSION, 'Content-Type': 'text/plain', 'Pragma': 'no-cache'}, alas)
        
        if uptodate:
            # Get the last modified time from the server
            connection = HTTPConnection(path[0])
            try:
                connection.request('HEAD', '/' + '/'.join(path[1:]), None, {'User-Agent': VERSION})
                r = connection.getresponse()
                last_modified = r.getheader('last-modified')
            finally:
                connection.close()

            # Check if the cached data is stale
            if not last_modified:
                # Without a time from the server the cache can't be verified
                logger.info('no last-modified time from the server for: '+filename)
                stale = True
            else:
                try:
                    age = self.check_mtime(last_modified, file = filename)
                except ValueError:
                    logger.info('unreadable last-modified time from the server for: '+filename)
                    stale = True
                else:
                    stale = age is not None and age > 0
            
            if stale:
                logger.info('cache out of date: '+filename)
                return (405, 'Method Not Allowed', {'Server': VERSION, 'Content-Type': 'text/plain', 'Pragma': 'no-cache'}, alas)

        # Check if the request needs the data
        if if_modified_time:
            try:
                age = self.check_mtime(if_modified_time, file = filename)
            except ValueError:
                # An invalid date in the request is ignored, the data is sent
                logger.info('ignoring unreadable if-modified-since time: '+if_modified_time)
                age = None
            if age is not None and age >= 0:
                logger.info('cache up to date and so is request: '+filename)
                return (304, 'Not Modified', {'Server': VERSION, 'Pragma': 'no-cache'}, '')
            
        # Read in the file and return the data
        f = open(filename, 'rb')
        try:
            data = f.read()
        finally:
            f.close()
        mtime_string = strftime(time_format+' GMT', gmtime(getmtime(filename)))
        
        logger.info('cache hit: '+filename)
        return (200, 'OK', {'Server': VERSION, 'Content-Type': 'text/plain', 'Last-Modified': mtime_string}, data)

    def get_filename(self, path):
        """Get the file name used for this path in the cache.
        
        @type path: C{list} of C{string}
        @param path: the server and path to download
        @rtype: C{string}
        @return: the file name (or None if the path is empty)
        
        """
        
        if not path:
            return None
        
        # Build the file name
        # NOTE(review): the components are joined as given, so '..' or an
        # absolute component is not rejected here -- confirm that callers
        # sanitize paths that come from requests
        file = join(self.cachedir, path[0])
        for i in path[1:]:
            file = join(file, i)

        return file

    def get_file_mtime(self, path):
        """Get the modification time of the file in the cache.
        
        @type path: C{list} of C{string}
        @param path: the server and path to check
        @rtype: C{int}
        @return: the file's modification time, or 0 if the file is not found
        
        """
        
        if not path:
            return 0
        
        # Build the file name
        file = self.get_filename(path)

        try:
            return int(getmtime(file))
        except Exception:
            return 0

    def check_mtime(self, http_mtime_string, path = [], file = '', server_mtime_string = ''):
        """Check the modified time of a file in the cache against a server header string.
        
        The cached file's time is truncated to whole seconds before the
        comparison, as the times in HTTP headers have no finer resolution.
        
        @type http_mtime_string: C{string}
        @param http_mtime_string: the modified time from an HTTP header
        @type path: C{list} of C{string}
        @param path: the server and path to download
            (optional, but one of file/path/server_mtime must be specified)
        @type file: C{string}
        @param file: the file name in the cache 
            (optional, but one of file/path/server_mtime must be specified)
        @type server_mtime_string: C{string}
        @param server_mtime_string: the last-modified time from the server's copy
            (optional, but one of file/path/server_mtime must be specified)
        @rtype: C{int}
        @return: the number of seconds the header's mtime is ahead of the 
            file's mtime (or None if the file isn't in the cache)
        @raise ValueError: if a time string does not match the expected format
        
        """
        
        assert path or file or server_mtime_string

        if path:
            file = self.get_filename(path)
            
        if file:
            # Return None if the file isn't in the cache
            if not exists(file):
                return None
            
            # Check the server's time against the cached copy. Drop the
            # fraction of a second: the time sent out in headers is
            # truncated too, and must compare as equal when sent back.
            server_mtime = int(getmtime(file))
        else:
            server_mtime = timegm(strptime(server_mtime_string, time_format+' %Z'))

        http_mtime = timegm(strptime(http_mtime_string, time_format+' %Z'))

        return http_mtime - server_mtime
