
Commit f69d426

kodek16 authored and accek committed
Implemented keeping track of file logical sizes. (#41)
* Implemented keeping track of file logical sizes. This makes Client.file_size() consistent regardless of whether the file is cached.
1 parent 0d9ad03 commit f69d426
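For context, blobs are stored gzip-compressed on the server, so before this change a HEAD response's Content-Length described the compressed blob rather than the original file. A minimal stdlib-only sketch of that discrepancy (illustrative, not part of the commit):

import gzip
import os
import tempfile

# Write a highly compressible payload to a temporary file.
fd, raw_path = tempfile.mkstemp()
with os.fdopen(fd, 'wb') as raw:
    raw.write(b'x' * 100000)

# Store it the way the server does: gzip-compressed.
blob_path = raw_path + '.gz'
with open(raw_path, 'rb') as src, gzip.open(blob_path, 'wb') as blob:
    blob.write(src.read())

print(os.stat(raw_path).st_size)   # logical size: 100000
print(os.stat(blob_path).st_size)  # blob size (the old Content-Length): far smaller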

File tree

5 files changed: +122 −59 lines

filetracker/client/remote_data_store.py

Lines changed: 5 additions & 3 deletions
@@ -4,9 +4,10 @@
 import functools
 import gzip
 import logging
+import os
 import shutil
-import time
 import tempfile
+import time
 
 import requests
 from six.moves.urllib.request import pathname2url
@@ -116,7 +117,8 @@ def add_file(self, name, filename, compress_hint=True):
                 with gzip.GzipFile(fileobj=tmp, mode='wb') as gz:
                     shutil.copyfileobj(f, gz)
                 tmp.seek(0)
-                headers.update({'Content-Encoding': 'gzip'})
+                headers['Content-Encoding'] = 'gzip'
+                headers['Logical-Size'] = str(os.stat(filename).st_size)
                 response = self._put_file(url, version, tmp, headers)
             else:
                 response = self._put_file(url, version, f, headers)
@@ -165,7 +167,7 @@ def file_size(self, name):
         url, version = self._parse_name(name)
         response = requests.head(url, allow_redirects=True)
         response.raise_for_status()
-        return int(response.headers.get('content-length', 0))
+        return int(response.headers.get('logical-size', 0))
 
     @_verbose_http_errors
     def delete_file(self, filename):
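The upshot on the wire: file_size() still issues a HEAD request, but now reads the new Logical-Size header instead of Content-Length. A sketch of that behaviour, assuming a patched server is running at the hypothetical URL below:

import requests

url = 'http://localhost:8000/files/size.txt'  # hypothetical server and file

response = requests.head(url, allow_redirects=True)
response.raise_for_status()

# Content-Length describes the stored (gzip-compressed) blob;
# Logical-Size carries the decompressed size recorded at upload time.
print(response.headers.get('content-length'))
print(response.headers.get('logical-size', 0))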

filetracker/interaction_test.py

Lines changed: 10 additions & 0 deletions
@@ -125,6 +125,16 @@ def test_file_version_should_be_set_to_current_time_on_upload(self):
         self.assertNotEqual(version, 1)
         self.assertTrue(pre_upload <= version <= post_upload)
 
+    def test_file_size_should_return_decompressed_size_without_cache(self):
+        src_file = os.path.join(self.temp_dir, 'size.txt')
+        with open(src_file, 'wb') as sf:
+            sf.write(b'hello size')  # size = 10
+
+        self.client.put_file('/size.txt', src_file, to_local_store=False)
+
+        self.assertEqual(
+            self.client.file_size('/size.txt'), len(b'hello size'))
+
     def test_every_link_should_have_independent_version(self):
         src_file = os.path.join(self.temp_dir, 'foo.txt')
         with open(src_file, 'wb') as sf:

filetracker/scripts/recover.py

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@ def main():
     processed_blobs = 0
     broken_blobs = 0
 
+    # TODO this script should be updated to recalculate file logical sizes.
     with progress_bar.conditional(show=not silent,
                                   widgets=blobs_widgets) as bar:
         for cur_dir, _, files in os.walk(file_storage.blobs_dir):
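A minimal sketch of what that TODO could do, assuming every blob is stored gzip-compressed as in filetracker/servers/storage.py; recompute_logical_size is a hypothetical helper, not part of this commit:

import gzip

_BUFFER_SIZE = 64 * 1024

def recompute_logical_size(blob_path):
    """Streams a gzip blob through decompression, counting the bytes."""
    size = 0
    with gzip.open(blob_path, 'rb') as decompressed:
        while True:
            buf = decompressed.read(_BUFFER_SIZE)
            if not buf:
                break
            size += len(buf)
    return size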

filetracker/servers/files.py

Lines changed: 10 additions & 6 deletions
@@ -52,27 +52,31 @@ def handle_PUT(self, environ, start_response):
         compressed = environ.get('HTTP_CONTENT_ENCODING', None) == 'gzip'
 
         digest = environ.get('HTTP_SHA256_CHECKSUM', None)
+        logical_size = environ.get('HTTP_LOGICAL_SIZE', None)
 
         version = self.storage.store(name=path,
                                      data=environ['wsgi.input'],
                                      version=last_modified,
                                      size=content_length,
                                      compressed=compressed,
-                                     digest=digest)
+                                     digest=digest,
+                                     logical_size=logical_size)
         start_response('200 OK', [
             ('Content-Type', 'text/plain'),
             ('Last-Modified', email.utils.formatdate(version)),
         ])
         return []
 
-    def _file_headers(self, path):
-        link_st = os.lstat(path)
-        blob_st = os.stat(path)
+    def _file_headers(self, name):
+        link_st = os.lstat(os.path.join(self.dir, name))
+        blob_st = os.stat(os.path.join(self.dir, name))
+        logical_size = self.storage.logical_size(name)
         return [
-            ('Last-Modified', email.utils.formatdate(link_st.st_mtime)),
             ('Content-Type', 'application/octet-stream'),
             ('Content-Length', str(blob_st.st_size)),
-            ('Content-Encoding', 'gzip')
+            ('Content-Encoding', 'gzip'),
+            ('Last-Modified', email.utils.formatdate(link_st.st_mtime)),
+            ('Logical-Size', str(logical_size)),
         ]
 
     def handle_GET(self, environ, start_response):
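Note that WSGI exposes request headers in environ with an HTTP_ prefix and dashes mapped to underscores, which is why the client's Logical-Size header surfaces as HTTP_LOGICAL_SIZE in handle_PUT above. A self-contained toy app (not the filetracker handler) demonstrating the mapping:

from wsgiref.simple_server import make_server

def app(environ, start_response):
    # A request header 'Logical-Size: 10' arrives as environ['HTTP_LOGICAL_SIZE'].
    logical_size = environ.get('HTTP_LOGICAL_SIZE', 'missing')
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return [logical_size.encode()]

if __name__ == '__main__':
    make_server('localhost', 8000, app).serve_forever()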

filetracker/servers/storage.py

Lines changed: 96 additions & 50 deletions
@@ -79,7 +79,8 @@ def __del__(self):
         self.db.close()
         self.db_env.close()
 
-    def store(self, name, data, version, size=0, compressed=False, digest=None):
+    def store(self, name, data, version, size=0,
+              compressed=False, digest=None, logical_size=None):
         """Adds a new file to the storage.
 
         If the file with the same name existed before, it's not
@@ -108,62 +109,53 @@ def store(self, name, data, version, size=0, compressed=False, digest=None):
             digest: SHA256 digest of the file before compression
                 If specified, the digest will not be computed again, saving
                 resources.
+            logical_size: if ``data`` is gzip-compressed, this parameter
+                has to be set to decompressed file size.
         """
         with _exclusive_lock(self._lock_path('links', name)):
             link_path = self._link_path(name)
             if _path_exists(link_path) and _file_version(link_path) > version:
                 return _file_version(link_path)
 
-            # Path to temporary file that may be created in some cases.
-            temp_file_path = None
-
-            if digest is None:
-                # Write data to temp file and calculate hash.
-                temp_file_fd, temp_file_path = tempfile.mkstemp()
-                temp_file = os.fdopen(temp_file_fd, 'wb')
-                _copy_stream(data, temp_file, size)
-                temp_file.close()
-
-                if compressed:
-                    # If data was already compressed, we have to decompress it
-                    # before calculating the digest.
-                    with gzip.open(temp_file_path, 'rb') as compressed_file:
-                        digest = file_digest(compressed_file)
-                else:
-                    digest = file_digest(temp_file_path)
+            # data is managed by contents now, and shouldn't be used directly
+            with _InputStreamWrapper(data, size) as contents:
+                if digest is None or logical_size is None:
+                    contents.save()
+                    if compressed:
+                        # This shouldn't occur if the request came from a proper
+                        # filetracker client, so we don't care if it's slow.
+                        with gzip.open(
+                                contents.current_path, 'rb') as decompressed:
+                            digest = file_digest(decompressed)
+                        with gzip.open(
+                                contents.current_path, 'rb') as decompressed:
+                            logical_size = _read_stream_for_size(decompressed)
+                    else:
+                        digest = file_digest(contents.current_path)
+                        logical_size = os.stat(contents.current_path).st_size
+
+                blob_path = self._blob_path(digest)
+
+                with self._lock_blob_with_txn(digest) as txn:
+                    digest_bytes = digest.encode()
 
-            blob_path = self._blob_path(digest)
-
-            with self._lock_blob_with_txn(digest) as txn:
-                digest_bytes = digest.encode('utf8')
-                try:
                     link_count = int(self.db.get(digest_bytes, 0, txn=txn))
-                except KeyError:
-                    link_count = 0
+                    new_count = str(link_count + 1).encode()
+                    self.db.put(digest_bytes, new_count, txn=txn)
 
-                new_count = str(link_count + 1).encode('utf8')
-                self.db.put(digest_bytes, new_count, txn=txn)
+                    # Create a new blob if this isn't a duplicate.
+                    if link_count == 0:
+                        _create_file_dirs(blob_path)
+                        self.db.put('{}:logical_size'.format(digest).encode(),
+                                    str(logical_size).encode())
 
-                if link_count == 0:
-                    # Create a new blob.
-                    _create_file_dirs(blob_path)
-                    if compressed:
-                        if temp_file_path:
-                            shutil.move(temp_file_path, blob_path)
+                        if compressed:
+                            contents.save(blob_path)
                         else:
-                            with open(blob_path, 'wb') as blob:
-                                _copy_stream(data, blob, size)
-                    else:
-                        if temp_file_path:
-                            with open(temp_file_path, 'rb') as raw,\
+                            contents.save()
+                            with open(contents.current_path, 'rb') as raw,\
                                     gzip.open(blob_path, 'wb') as blob:
                                 shutil.copyfileobj(raw, blob)
-                        else:
-                            with gzip.open(blob_path, 'wb') as blob:
-                                _copy_stream(data, blob, size)
-
-            if temp_file_path and os.path.exists(temp_file_path):
-                os.unlink(temp_file_path)
 
             if _path_exists(link_path):
                 # Lend the link lock to delete().
@@ -206,29 +198,34 @@ def delete(self, name, version, _lock=True):
             digest = self._digest_for_link(name)
             with self._lock_blob_with_txn(digest) as txn:
                 os.unlink(link_path)
-                digest_bytes = digest.encode('utf8')
+                digest_bytes = digest.encode()
                 link_count = self.db.get(digest_bytes, txn=txn)
                 if link_count is None:
                     raise RuntimeError("File exists but has no key in db")
                 link_count = int(link_count)
                 if link_count == 1:
                     self.db.delete(digest_bytes, txn=txn)
+                    self.db.delete(
+                        '{}:logical_size'.format(digest).encode(), txn=txn)
                     os.unlink(self._blob_path(digest))
                 else:
-                    new_count = str(link_count - 1).encode('utf8')
+                    new_count = str(link_count - 1).encode()
                     self.db.put(digest_bytes, new_count, txn=txn)
         return True
 
     def stored_version(self, name):
-        """
-        Returns the version of file `name` that is currently stored
-        or None if it doesn't exist.
-        """
+        """Returns the version of file `name` or None if it doesn't exist."""
         link_path = self._link_path(name)
         if not _path_exists(link_path):
             return None
         return _file_version(link_path)
 
+    def logical_size(self, name):
+        """Returns the logical size (before compression) of file `name`."""
+        digest = self._digest_for_link(name)
+        return int(self.db.get('{}:logical_size'
+                               .format(digest).encode()).decode())
+
     def _link_path(self, name):
         return os.path.join(self.links_dir, name)
 
@@ -261,6 +258,44 @@ def _digest_for_link(self, name):
         return digest
 
 
+class _InputStreamWrapper(object):
+    """A wrapper for lazy reading and moving contents of 'wsgi.input'.
+
+    Should be used as a context manager.
+    """
+    def __init__(self, data, size):
+        self._data = data
+        self._size = size
+        self.current_path = None
+        self.saved_in_temp = False
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, _exc_type, _exc_value, _traceback):
+        """Removes file if it was last saved as a temporary file."""
+        if self.saved_in_temp:
+            os.unlink(self.current_path)
+
+    def save(self, new_path=None):
+        """Moves or creates the file with stream contents to a new location.
+
+        Args:
+            new_path: path to move to, if None a temporary file is created.
+        """
+        self.saved_in_temp = new_path is None
+        if new_path is None:
+            fd, new_path = tempfile.mkstemp()
+            os.close(fd)
+
+        if self.current_path:
+            shutil.move(self.current_path, new_path)
+        else:
+            with open(new_path, 'wb') as dest:
+                _copy_stream(self._data, dest, self._size)
+        self.current_path = new_path
+
+
 _BUFFER_SIZE = 64 * 1024
 
 
@@ -292,6 +327,17 @@ def _copy_stream(src, dest, length=0):
         bytes_left -= buf_size
 
 
+def _read_stream_for_size(stream):
+    """Reads a stream discarding the data read and returns its size."""
+    size = 0
+    while True:
+        buf = stream.read(_BUFFER_SIZE)
+        size += len(buf)
+        if not buf:
+            break
+    return size
+
+
 def _create_file_dirs(file_path):
     """Creates directory tree to file if it doesn't exist."""
     dir_name = os.path.dirname(file_path)
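Finally, a short usage sketch of the new _InputStreamWrapper, with io.BytesIO standing in for 'wsgi.input' (assumes the helpers from filetracker/servers/storage.py are in scope; the /tmp path is illustrative):

import io

payload = b'hello size'
with _InputStreamWrapper(io.BytesIO(payload), size=len(payload)) as contents:
    contents.save()                 # no path given: spills the stream to a temp file
    print(contents.current_path)    # the temp file now holds the payload
    contents.save('/tmp/blob')      # moves the temp file to its final location
# __exit__ unlinks nothing here: the last save() targeted a real path,
# so saved_in_temp is False and the blob survives the context exit.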
