Source code for warcat.model.warc

'''WARC model starting point'''
# Copyright 2013 Christopher Foo <chris.foo@gmail.com>
# Licensed under GPLv3. See COPYING.txt for details.
from warcat import util
from warcat.model.binary import BytesSerializable
from warcat.model.common import FIELD_DELIM_BYTES
from warcat.model.record import Record
import gzip
import logging


# Module-level logger, named after this module via ``__name__``.
_logger = logging.getLogger(__name__)


class WARC(BytesSerializable):
    '''A Web ARChive file model.

    Typically, large streaming operations should use :func:`open` and
    :func:`read_record` functions.

    :ivar records: A list of the :class:`Record` instances read so far,
        in the order they were read.
    '''

    def __init__(self):
        self.records = []

    def load(self, filename):
        '''Open and load the contents of the given filename.

        The records are located in :attr:`records`.

        :param filename: The path of the file, passed to :func:`open`.
        '''

        f = self.open(filename)

        # Close the file even when parsing fails part-way through, so the
        # handle is never leaked on a malformed archive.
        try:
            self.read_file_object(f)
        finally:
            f.close()

    def read_file_object(self, file_object):
        '''Read records until the file object is exhausted.

        Every record read is appended to :attr:`records`. At least one
        record is always read before the end-of-file check is consulted.

        :param file_object: A file object accepted by :func:`read_record`.
        '''

        has_more = True

        while has_more:
            record, has_more = self.read_record(file_object)
            self.records.append(record)

    @classmethod
    def open(cls, filename, force_gzip=False):
        '''Return a logical file object.

        :param filename: The path of the file. gzip compression is detected
            using file extension.
        :param force_gzip: Use gzip compression always.
        :return: For gzip input, the gzip file wrapped in
            :class:`util.DiskBufferedReader`; otherwise the plain file
            opened in binary read mode.
        '''

        if filename.endswith('.gz') or force_gzip:
            f = gzip.open(filename)
            _logger.info('Opened gziped file %s', filename)

            # If the wrapper cannot be constructed, nobody else holds a
            # reference to the gzip handle, so release it before re-raising.
            try:
                return util.DiskBufferedReader(f)
            except Exception:
                f.close()
                raise

        f = open(filename, 'rb')
        _logger.info('Opened file %s', filename)
        return f

    @classmethod
    def read_record(cls, file_object, preserve_block=False,
                    check_block_length=True):
        '''Return a record and whether there are more records to read.

        .. seealso:: :class:`Record`

        :param file_object: A binary file object positioned at the start of
            a record. It must support ``read``, ``tell`` and ``peek``.
        :param preserve_block: Forwarded unchanged to :func:`Record.load`.
        :param check_block_length: Forwarded unchanged to
            :func:`Record.load`.
        :return: A tuple. The first item is the :class:`Record`. The second
            item is a boolean indicating whether there are more records to
            be read.
        :raise IOError: If the bytes following the record are not
            ``FIELD_DELIM_BYTES``.
        '''

        record = Record.load(file_object, preserve_block=preserve_block,
            check_block_length=check_block_length)
        _logger.debug('Finished reading a record %s', record.record_id)

        # Each record must be followed by the field delimiter.
        data = file_object.read(len(FIELD_DELIM_BYTES))

        if data != FIELD_DELIM_BYTES:
            _logger.debug('Wrong delim %s', data)
            raise IOError('Blocks not separated correctly (tell={})'.format(
                file_object.tell()))

        # An empty peek means there is no data left after the delimiter.
        if not file_object.peek(1):
            _logger.info('Finished reading Warc')
            return (record, False)

        return (record, True)

    def iter_bytes(self):
        '''Yield every value produced by each record's ``iter_bytes``.

        Records are visited in the order they appear in :attr:`records`.
        '''

        for record in self.records:
            for v in record.iter_bytes():
                yield v


# Public names exported by a star-import of this module.
__all__ = ['WARC']