# Source code for warcat.model.binary

'''Model serialization and binary references'''
# Copyright 2013 Christopher Foo <chris.foo@gmail.com>
# Licensed under GPLv3. See COPYING.txt for details.
from warcat import util
import abc
import gzip
import logging
import tempfile


# Module-level logger, named after this module's import path.
_logger = logging.getLogger(__name__)


class BytesSerializable(metaclass=abc.ABCMeta):
    '''Abstract base class for objects that can be serialized to bytes.

    Subclasses implement :func:`iter_bytes`; ``bytes(obj)`` then returns
    the concatenation of everything that iterable produces.
    '''

    @abc.abstractmethod
    def iter_bytes(self):
        '''Return an iterable of bytes'''
        pass

    def __bytes__(self):
        # Join once instead of concatenating chunk by chunk.
        return b''.join(self.iter_bytes())


class StrSerializable(metaclass=abc.ABCMeta):
    '''Abstract base class for objects that can be serialized to str.

    Subclasses implement :func:`iter_str`; ``str(obj)`` then returns the
    concatenation of everything that iterable produces.
    '''

    @abc.abstractmethod
    def iter_str(self):
        '''Return an iterable of str'''
        pass

    def __str__(self):
        # Join once instead of concatenating chunk by chunk.
        return ''.join(self.iter_str())


class BinaryFileRef(metaclass=abc.ABCMeta):
    '''Reference to a file containing the content block data.

    .. attribute:: file_offset

        When reading, the file is seeked to `file_offset`.

    .. attribute:: length

        The length of the data. If `None`, :func:`iter_file` reads until
        the file is exhausted.

    .. attribute:: filename

        The filename of the referenced data. It must be a valid file.

    .. attribute:: file_obj

        The file object to be read from. It is important that this file
        object is not shared or race conditions will occur. File objects
        are not closed automatically.

    .. note::

        Either :attr:`filename` or :attr:`file_obj` must be set. When both
        are set, :func:`get_file` uses :attr:`filename`.
    '''

    def __init__(self):
        self.file_offset = 0
        self.length = None
        self.filename = None
        self.file_obj = None

    def set_file(self, file, offset=0, length=None):
        '''Set the reference to the file or filename with the data.

        This is a convenience function to setting the attributes individually.

        :param file:
            Either a file object (anything with a ``read`` attribute) or a
            filename. The reference of the other kind is cleared so that
            a previously set value cannot shadow the new one.
        :param offset: Stored as :attr:`file_offset`.
        :param length: Stored as :attr:`length`.
        '''

        assert file

        if hasattr(file, 'read'):
            self.file_obj = file
            # get_file() prefers the filename, so a stale one must not linger.
            self.filename = None
        else:
            self.filename = file
            self.file_obj = None

        self.file_offset = offset
        self.length = length

    def iter_file(self, buffer_size=4096):
        '''Return an iterable of bytes of the source data

        :param buffer_size: Maximum number of bytes requested per read.

        The data is read from a safe copy (see :func:`get_file`), which is
        closed once the iteration finishes.
        '''

        with self.get_file(safe=True) as file_obj:
            # None means "no limit": keep reading until the copy runs dry.
            remaining = self.length

            while remaining is None or remaining > 0:
                if remaining is None:
                    chunk_size = buffer_size
                else:
                    chunk_size = min(buffer_size, remaining)

                data = file_obj.read(chunk_size)

                if not data:
                    break

                if remaining is not None:
                    remaining -= len(data)

                yield data

    def get_file(self, safe=True, spool_size=10485760):
        '''Return a file object with the data.

        :param safe:
            If `True`, return a new file object that is a copy of the data.
            You will be responsible for closing the file.

            Otherwise, it will be the original file object that is seeked
            to the correct offset. Be sure to not read beyond its length and
            seek back to the original position if necessary.
        :param spool_size:
            Passed as ``max_size`` to the
            :class:`tempfile.SpooledTemporaryFile` that holds the safe copy.
        '''

        if self.filename:
            file_obj = util.file_cache.get(self.filename)

            if not file_obj:
                if self.filename.endswith('.gz'):
                    file_obj = util.DiskBufferedReader(
                        gzip.GzipFile(self.filename))
                else:
                    file_obj = open(self.filename, 'rb')

                util.file_cache.put(self.filename, file_obj)
        else:
            file_obj = self.file_obj

        original_position = file_obj.tell()

        # Always seek, even for offset 0: a cached or reused file object may
        # be positioned anywhere after earlier reads.
        if self.file_offset is not None:
            file_obj.seek(self.file_offset)

        if not safe:
            return file_obj

        _logger.debug('Creating safe file of %s',
            self.filename or self.file_obj)
        temp_file_obj = tempfile.SpooledTemporaryFile(max_size=spool_size)

        try:
            # NOTE(review): presumably copies at most `length` bytes; confirm
            # against util.copyfile_obj.
            util.copyfile_obj(file_obj, temp_file_obj, max_length=self.length)
            temp_file_obj.seek(0)
        except Exception:
            # Do not leak the temporary file when the copy fails.
            temp_file_obj.close()
            raise
        finally:
            # Leave the source where it was found, whether or not the copy
            # succeeded.
            file_obj.seek(original_position)

        return temp_file_obj


# Names exported by ``from warcat.model.binary import *``.
__all__ = ['BytesSerializable', 'StrSerializable', 'BinaryFileRef']