Source code for warcat.util

'''Utility functions'''
# Copyright 2013 Christopher Foo <>
# Licensed under GPLv3. See COPYING.txt for details.
import collections
import datetime
import email.utils
import hashlib
import http.client
import io
import logging
import os
import tempfile
import threading
import urllib.parse

_logger = logging.getLogger(__name__)

def printable_str_to_str(s):
    '''Convert a printable, escaped string back to its raw form.

    Literal tab, carriage return and line feed characters are dropped
    first; afterwards the two-character escape sequences ``\\r``, ``\\n``
    and ``\\t`` are turned into the control characters they stand for.
    '''
    without_whitespace = s.translate(str.maketrans('', '', '\t\r\n'))
    escapes = (
        (r'\r', '\r'),
        (r'\n', '\n'),
        (r'\t', '\t'),
    )

    result = without_whitespace
    for escaped, control_char in escapes:
        result = result.replace(escaped, control_char)

    return result
def find_file_pattern(file_obj, pattern, bufsize=512, limit=4096,
                      inclusive=False):
    '''Find the offset from current position of pattern.

    :param file_obj: A seekable binary file object. Its position is
        restored to where it was on entry before this function returns or
        raises.
    :param pattern: The bytes to search for.
    :param bufsize: Number of bytes to read per iteration.
    :param limit: Maximum number of bytes to examine. A false value
        (``None`` or ``0``) disables the limit.
    :param inclusive: If true, the returned offset points just past the
        pattern instead of at its first byte.
    :returns: The offset of the first match relative to the starting
        position.
    :raises ValueError: If the pattern is not found before the data or the
        limit is exhausted.
    '''
    original_position = file_obj.tell()
    bytes_read = 0
    # Only the last ``len(pattern) - 1`` bytes of already-searched data can
    # be part of a match that straddles two reads, so keep just that tail
    # instead of an ever-growing buffer.
    overlap = max(len(pattern) - 1, 0)
    window = b''
    window_start = 0  # offset of window[0] relative to original_position

    try:
        while True:
            if limit:
                size = min(bufsize, limit - bytes_read)
            else:
                size = bufsize

            data = file_obj.read(size)

            if not data:
                break

            bytes_read += len(data)
            window += data
            index = window.find(pattern)

            if index != -1:
                offset = window_start + index

                if inclusive:
                    offset += len(pattern)

                return offset

            if len(window) > overlap:
                window_start += len(window) - overlap
                window = window[-overlap:] if overlap else b''
    finally:
        # The search must not disturb the caller's read position.
        file_obj.seek(original_position)

    raise ValueError('Search for pattern exhausted')
def strip_warc_extension(s):
    '''Removes ``.warc`` or ``.warc.gz`` from filename.

    Any other name, including one that merely ends in ``.gz``, is returned
    unchanged.

    :param s: The filename.
    :returns: The filename without its WARC extension.
    '''
    # Check the longer suffix first so ``.warc.gz`` is removed as a whole.
    for extension in ('.warc.gz', '.warc'):
        if s.endswith(extension):
            return s[:-len(extension)]

    return s
class DiskBufferedReader(io.BufferedIOBase):
    '''Buffers the file to disk large parts at a time.

    The raw stream is copied in blocks of ``disk_buffer_size`` bytes into
    spooled temporary files. Reads and seeks are served from the block that
    contains the current offset; recently used blocks are kept in a
    :class:`FileCache`.

    :param raw: The underlying seekable binary file object.
    :param disk_buffer_size: Size in bytes of each buffered block.
    :param spool_size: In-memory size limit passed to
        :class:`tempfile.SpooledTemporaryFile` for each block.
    '''

    # Some segments lifted from the Python standard library
    # Copyright 2001-2011 Python Software Foundation
    # Licensed under Python Software Foundation License Version 2

    def __init__(self, raw, disk_buffer_size=104857600, spool_size=10485760):
        io.BufferedIOBase.__init__(self)
        self._raw = raw
        self._disk_buffer_size = disk_buffer_size
        self._offset = 0
        self._block_index = None
        self._block_file = None
        self._spool_size = spool_size
        self._lock = threading.RLock()
        self._cache = FileCache()

        self._set_block(0)

    def _set_block(self, index):
        '''Make the block with the given index the current block file.

        The block is taken from the cache when available; otherwise it is
        created by copying up to ``disk_buffer_size`` bytes from the raw
        stream. The block file is left positioned at its start.
        '''
        if index == self._block_index:
            return

        with self._lock:
            block_file = self._cache.get(index)

            if block_file is not None:
                _logger.debug('Buffer block file cache hit. index=%d', index)
            else:
                _logger.debug('Creating buffer block file. index=%d', index)

                block_file = tempfile.SpooledTemporaryFile(
                    max_size=self._spool_size)

                try:
                    self.raw.seek(index * self._disk_buffer_size)
                    copyfile_obj(self.raw, block_file,
                        max_length=self._disk_buffer_size)
                except Exception:
                    # Do not leak the temporary file if the copy fails.
                    block_file.close()
                    raise

                self._cache.put(index, block_file)

                _logger.debug('Buffer block file created. length=%d',
                    block_file.tell())

            block_file.seek(0)
            # Only publish the new block once it is fully prepared.
            self._block_index = index
            self._block_file = block_file

    def _sync_block_file(self):
        '''Select and position the block file for the current offset.'''
        self._set_block(self._offset // self._disk_buffer_size)
        self._block_file.seek(self._offset % self._disk_buffer_size)

    def tell(self):
        '''Return the current offset in the stream.'''
        return self._offset

    def seek(self, pos, whence=0):
        '''Move to a new offset.

        :param pos: The offset, absolute for ``whence=0`` or relative to
            the current offset for ``whence=1``.
        :param whence: ``0`` or ``1``. Seeking relative to the end is not
            supported.
        :returns: The new absolute offset.
        :raises ValueError: If ``whence`` is unsupported or the resulting
            offset is negative.
        '''
        if not (0 <= whence <= 1):
            raise ValueError('Bad whence argument')

        with self._lock:
            if whence == 1:
                new_offset = self._offset + pos
            else:
                new_offset = pos

            if new_offset < 0:
                raise ValueError('Negative seek position')

            self._offset = new_offset
            self._sync_block_file()

            return self._offset

    def read(self, n=None):
        '''Read and return up to ``n`` bytes.

        :param n: Number of bytes to read. ``None`` or a negative value
            reads until the end of the stream.
        :returns: The bytes read; shorter than ``n`` only at end of stream.
        '''
        read_all = n is None or n < 0
        bytes_left = n
        chunks = []

        with self._lock:
            while read_all or bytes_left > 0:
                # Re-sync every pass so a read that reaches the end of one
                # block continues in the next one.
                self._sync_block_file()

                data = self._block_file.read(-1 if read_all else bytes_left)

                if not data:
                    break

                self._offset += len(data)
                chunks.append(data)

                if not read_all:
                    bytes_left -= len(data)

        return b''.join(chunks)

    def peek(self, n=0):
        '''Return up to ``n`` bytes without advancing the offset.'''
        with self._lock:
            original_position = self.tell()
            data = self.read(n)
            self.seek(original_position)

            return data

    def seekable(self):
        '''Return whether the raw stream is seekable.'''
        return self.raw.seekable()

    def readable(self):
        '''Return whether the raw stream is readable.'''
        return self.raw.readable()

    def writable(self):
        '''Return ``False``; this reader never writes.'''
        return False

    @property
    def raw(self):
        '''The underlying raw file object.'''
        return self._raw

    @property
    def closed(self):
        '''Whether the raw file object is closed.'''
        return self.raw.closed

    @property
    def name(self):
        '''The name of the raw file object.'''
        return self.raw.name

    @property
    def mode(self):
        '''The mode of the raw file object.'''
        return self.raw.mode

    def fileno(self):
        '''Return the file descriptor of the raw file object.'''
        return self.raw.fileno()

    def isatty(self):
        '''Return whether the raw file object is a TTY.'''
        return self.raw.isatty()
class FileCache(object):
    '''A cache containing references to file objects.

    File objects are closed when expired. Class is thread safe and will
    only return file objects belonging to its own thread.

    :param size: Maximum number of file objects held. The oldest entry is
        closed and dropped when a new one would exceed this number.
    '''

    def __init__(self, size=4):
        self._size = size
        # Entries are (filename, thread, file_obj) tuples, oldest first.
        self._files = collections.deque()
        self._lock = threading.Lock()

    def get(self, filename):
        '''Return the file object cached under ``filename`` by the calling
        thread, or ``None`` if there is none.'''
        thread_id = threading.current_thread()

        with self._lock:
            return self._get(filename, thread_id)

    def _get(self, filename, thread_id):
        '''Look up an entry without locking; the caller holds the lock.'''
        for cache_filename, cache_thread_id, file_obj in self._files:
            if filename == cache_filename and thread_id == cache_thread_id:
                return file_obj

        return None

    def put(self, filename, file_obj):
        '''Cache ``file_obj`` under ``filename`` for the calling thread.

        If the thread already has an entry for ``filename``, the call does
        nothing and the existing entry is kept.
        '''
        thread_id = threading.current_thread()

        with self._lock:
            # Compare against None so that a file object which happens to
            # be falsy still counts as present.
            if self._get(filename, thread_id) is not None:
                return

            # Evict before inserting so the cache never holds more than
            # ``size`` entries.
            while self._files and len(self._files) >= self._size:
                old_file_obj = self._files.popleft()[2]
                old_file_obj.close()

            self._files.append((filename, thread_id, file_obj))
def copyfile_obj(source, dest, bufsize=4096, max_length=None,
                 write_attr_name='write'):
    '''Like :func:`shutil.copyfileobj` but with limit on how much to copy.

    :param source: File object to read from.
    :param dest: Object receiving the data.
    :param bufsize: Maximum number of bytes to read per iteration.
    :param max_length: Maximum total number of bytes to copy, or ``None``
        to copy until ``source`` is exhausted.
    :param write_attr_name: Name of the method on ``dest`` that is called
        with each chunk of data.
    '''
    bytes_read = 0
    write_func = getattr(dest, write_attr_name)

    while True:
        if max_length is not None:
            read_size = min(bufsize, max_length - bytes_read)

            # Stop once the limit is reached; a negative size would
            # otherwise make ``read`` return everything that is left.
            if read_size <= 0:
                break
        else:
            read_size = bufsize

        data = source.read(read_size)

        if not data:
            break

        write_func(data)
        bytes_read += len(data)
class HTTPSocketShim(io.BytesIO):
    '''An in-memory bytes buffer that also offers a ``makefile`` method.'''

    def makefile(self, *_args, **_kwargs):
        '''Return this buffer itself; all arguments are ignored.'''
        return self
def parse_http_response(file_obj):
    '''Parse and return :class:`http.client.HTTPResponse`.

    ``file_obj`` is wrapped in a :class:`HTTPSocketShim` and handed to the
    response object, whose ``begin()`` is called before it is returned.
    '''
    shim = HTTPSocketShim(file_obj)
    parsed_response = http.client.HTTPResponse(shim)
    parsed_response.begin()

    return parsed_response
def split_url_to_filename(s):
    '''Attempt to split a URL to a filename on disk.

    Returns a list of sanitized path parts: the network location followed
    by each ``/``-separated path segment. An empty final segment is
    replaced by an index filename, and a query string is appended to the
    final part after an underscore.

    Raises :class:`ValueError` if any part is a current- or
    parent-directory name.
    '''
    url_info = urllib.parse.urlsplit(s)
    path_segments = [
        sanitize_str(segment)
        for segment in url_info.path.lstrip('/').split('/')
    ]
    parts = [sanitize_str(url_info.netloc)] + path_segments

    # ``split`` always yields at least one segment, so [-1] is safe.
    if not path_segments[-1]:
        parts[-1] = append_index_filename(path_segments[-1])

    if url_info.query:
        parts[-1] += '_' + sanitize_str(url_info.query)

    traversal_names = frozenset([os.curdir, os.pardir, '.', '..'])

    if traversal_names & frozenset(parts):
        raise ValueError('Path contains directory traversal filenames')

    return parts
# Characters replaced by sanitize_str: common filesystem-reserved
# punctuation, the control characters 0-31 and DEL (0x7f).
SANITIZE_BLACKLIST = frozenset(
    r'/\:*?"<>|' + ''.join(map(chr, range(32))) + '\x7f'
)
def sanitize_str(s):
    '''Replaces unsavory characters in a string with an underscore.

    Every character found in ``SANITIZE_BLACKLIST`` becomes ``_``; all
    other characters are kept as they are.
    '''
    return ''.join(
        '_' if char in SANITIZE_BLACKLIST else char
        for char in s
    )
def append_index_filename(path):
    '''Adds ``_index_xxxxxx`` to the path.

    The suffix is the first six hex digits of the SHA-1 digest of the
    path's basename, i.e. its filename component.
    '''
    basename = os.path.basename(path)
    digest = hashlib.sha1(basename.encode()).hexdigest()
    suffix = '_index_{}'.format(digest[:6])

    return path + suffix
def rename_filename_dirs(dest_filename):
    '''Renames files if they conflict with a directory in given path.

    The parent paths of ``dest_filename`` are checked from the innermost
    outwards. The first one that is an existing regular file is renamed
    using :func:`append_index_filename`, after which the search stops.
    '''
    parent, component = os.path.split(dest_filename)

    while component:
        if os.path.isfile(parent):
            renamed = append_index_filename(parent)
            _logger.debug('Rename %s -> %s', parent, renamed)
            os.rename(parent, renamed)
            return

        parent, component = os.path.split(parent)
def truncate_filename_parts(path_parts, length=160):
    '''Truncate and suffix filename path parts if they exceed the given
    length.

    A part longer than ``length`` is cut to ``length`` characters and given
    an underscore plus the first six hex digits of the SHA-1 digest of the
    full part (``_xxxxxx``). Shorter parts are kept unchanged. A new list
    is returned.
    '''
    def shorten(part):
        if len(part) <= length:
            return part

        digest = hashlib.sha1(part.encode()).hexdigest()
        return '{}_{}'.format(part[:length], digest[:6])

    return [shorten(part) for part in path_parts]
def parse_http_date(s):
    '''Parse a date string into a timezone-aware datetime.

    The string is parsed with :func:`email.utils.parsedate_tz`; the UTC
    offset it reports becomes the timezone of the result.

    Raises :class:`ValueError` if the string cannot be parsed.
    '''
    parsed = email.utils.parsedate_tz(s)

    if not parsed:
        raise ValueError('Unable to parse date')

    year, month, day, hour, minute, second = parsed[:6]
    utc_offset = datetime.timedelta(seconds=parsed[9])

    return datetime.datetime(
        year, month, day, hour, minute, second,
        tzinfo=datetime.timezone(utc_offset)
    )
# Module-level FileCache instance, created with the default cache size.
file_cache = FileCache()
'''The :class:`FileCache` instance'''