187 changes: 97 additions & 90 deletions pyremotezip/remotezip.py
@@ -1,10 +1,15 @@
# -*- coding: utf-8 -*-

import zlib
import struct

from urllib.request import Request, urlopen
from urllib.error import HTTPError

import logging


logger = logging.getLogger('pyremotezip')


class RemoteZip(object):
@@ -19,19 +24,36 @@ def __init__(self, zipURI):
self.filesize = None
self.zipURI = zipURI
self.tableOfContents = None
self.fileList = None
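# fileList preserves the central directory order, while tableOfContents is
# keyed by filename for direct lookups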

def __file_exists(self):
# check if file exists
headRequest = Request(self.zipURI)
headRequest.get_method = lambda: 'HEAD'
try:
response = urlopen(headRequest)
self.filesize = int(response.getheader('Content-Length'))
return True
except HTTPError as e:
logger.error('Unable to retrieve remote zip: %r', e)
return False

def __request_range(self, start, offset):
""" This function makes a range http request
"""

request = Request(self.zipURI)
request.headers['Range'] = 'bytes=%s-%s' % (start, offset, )
handle = urlopen(request)
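# Illustrative exchange (made-up values): the request carries
# "Range: bytes=100-199"; a server that supports ranges answers
# "206 Partial Content" with "Content-Range: bytes 100-199/<filesize>",
# which is exactly what the check below verifies. A server that ignores
# the Range header answers 200 with the whole file and no Content-Range.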

# make sure the response is ranged
return_range = handle.getheader('Content-Range')
if return_range != "bytes %d-%d/%s" % (start, offset, self.filesize, ):
raise Exception("Ranged requests are not supported for this URI")
# got here? we're fine, read the contents
raw_bytes = handle.read()
return raw_bytes

def getTableOfContents(self):
"""
This function populates the internal tableOfContents list with the contents
@@ -43,105 +65,90 @@ def getTableOfContents(self):
raise FileNotFoundException()

# now request bytes from that size minus a 64kb max zip directory length
if self.filesize < 64 * 2 ** 10:
start = 0  # if the file is smaller than 64kb, get everything
else:
start = self.filesize - (64 * 2 ** 10)
end = self.filesize - 1
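# the end-of-central-directory record is 22 bytes plus an archive comment of
# at most 65535 bytes, so it is expected to sit inside this final 64kb window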
raw_bytes = self.__request_range(start, end)

directory_size, directory_start = self.__read_central_directory_size_and_offset(raw_bytes)
# directory_start is an offset from the start of the remote file, while
# raw_bytes begins at `start`, so translate it into a buffer offset first
directory_start -= start
table_of_contents, file_list = self.__read_central_directory(raw_bytes[directory_start:directory_start + directory_size])
self.tableOfContents = table_of_contents
self.fileList = file_list
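# tableOfContents maps each (bytes) filename from the central directory to a
# dict with 'filename', 'compressedsize', 'uncompressedsize' and 'filestart',
# e.g. {b'README.md': {'filename': b'README.md', 'compressedsize': 22,
#                      'uncompressedsize': 22, 'filestart': 0}} (illustrative values)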
return self.tableOfContents

def __read_central_directory_size_and_offset(self, raw_bytes):
''' reads the end of central directory structure
'''
# find the end-of-directory: 06054b50
directory_end = raw_bytes.find(b"\x50\x4b\x05\x06")
if directory_end < 0:
raise Exception('Could not find EOCD')
eocd = raw_bytes[directory_end:]
# central directory size is at offset 12 of the EOCD record and its offset
# from the start of the file is at offset 16 (both little-endian uint32)
cd_size, cd_offset = struct.unpack('<II', eocd[12:20])
return cd_size, cd_offset

def __read_central_directory(self, raw_bytes):
''' reads the zip file central directory structure
'''
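# Each central directory file header is 46 fixed bytes followed by the
# filename, extra field and comment. Only these little-endian fields are read:
#   offset 20: compressed size (4 bytes)    offset 24: uncompressed size (4 bytes)
#   offset 28: filename length (2 bytes)    offset 30: extra field length (2 bytes)
#   offset 32: comment length (2 bytes)     offset 42: local header offset (4 bytes)
#   offset 46: filename (variable)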
table_of_contents = {}
file_list = []
while True:
try:
comp_size, uncomp_size = struct.unpack('<II', raw_bytes[20:20 + 8])
except struct.error:
# ran past the last central directory entry
break
n, m, k = struct.unpack('<HHH', raw_bytes[28:28 + 6])
offset, = struct.unpack('<I', raw_bytes[42:42 + 4])
filename = raw_bytes[46:46 + n]
table_of_contents[filename] = {'filename': filename,
'compressedsize': comp_size,
'uncompressedsize': uncomp_size,
'filestart': offset}
file_list.append(filename)
raw_bytes = raw_bytes[46 + n + m + k:]
return table_of_contents, file_list

def __read_data_descriptor(self, local_file_header, compressed_file_start):
start_compressed_data = local_file_header[compressed_file_start:]
start_data_descriptor = start_compressed_data.find(b'\x50\x4b\x07\x08')
if start_data_descriptor >= 0:
# descriptor: 4-byte signature, then crc-32, compressed and uncompressed sizes
crc, comp_size, uncomp_size = struct.unpack(
'<III', start_compressed_data[start_data_descriptor + 4:start_data_descriptor + 16])
return crc, comp_size, uncomp_size

def __read_local_file_header(self, raw_bytes):
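# The local file header is a fixed 30-byte structure followed by the filename
# (length at offset 26) and the extra field (length at offset 28); the
# compressed data starts immediately after, which is what header_size captures.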
zip_n, zip_m = struct.unpack("<HH", raw_bytes[26:30])
header_size = 30 + zip_n + zip_m
return header_size

def extractFile(self, filename):
"""
This function will extract a single file from the remote zip without downloading
the entire zip file. The filename argument must match one of the keys of
tableOfContents, i.e. a bytes filename exactly as stored in the archive.
"""
if filename not in self.tableOfContents:
raise FileNotFoundException('Requested file not found: %r' % (filename))

file_offset = self.tableOfContents[filename]['filestart']
file_compressed_size = self.tableOfContents[filename]['compressedsize']
file_uncompressed_size = self.tableOfContents[filename]['uncompressedsize']

local_file_header = self.__request_range(file_offset, file_offset + 30)
local_file_header_size = self.__read_local_file_header(local_file_header)
compressed_data = self.__request_range(file_offset + local_file_header_size, file_offset + local_file_header_size + file_compressed_size - 1)
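# the second range request covers exactly the compressed payload: it starts
# right after the local file header and spans file_compressed_size bytes (inclusive range)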

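# equal compressed and uncompressed sizes are treated as a stored
# (uncompressed) entry; anything else is assumed to be raw deflate data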
if file_compressed_size == file_uncompressed_size:
return compressed_data

dec = zlib.decompressobj(-zlib.MAX_WBITS)
# inflate the raw deflate stream in one go and flush any buffered remainder
uncompressed_data = dec.decompress(compressed_data) + dec.flush()

return uncompressed_data


17 changes: 17 additions & 0 deletions test_remotezip.py
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

from pyremotezip.remotezip import RemoteZip

zip_file = b'PK\x03\x04\n\x00\x00\x00\x00\x00\xd7l\xc2D%n\x1c\xa0\x16\x00\x00\x00\x16\x00\x00\x00\t\x00\x1c\x00README.mdUT\t\x00\x03Vp\x8cS\xc4\xda\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00README FILE\n\ndonut2.c\nPK\x03\x04\n\x00\x00\x00\x00\x00\x9c\xa9\xc2D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x1c\x00src/UT\t\x00\x03\xc7\xda\x8cS#\xdb\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x03\x04\x14\x00\x00\x00\x08\x00\xc9l\xc2D\x99\x91v\n\xf2\x03\x00\x00\xac\x06\x00\x00\x0c\x00\x1c\x00src/donut2.cUT\t\x00\x03:p\x8cS\xc7\xda\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00MUks\xa2H\x14\xfd\xce\xaf\xe8tb\xec\x86F\x1bDcd\x1b7V\xac\xdd\xa4fL\xed#&\x19d\\$j\xf0\x01\x8a\x9a@\x94\xf9\xed\xdb `\xb0\xca\xe2\xdc{\xcf\xed\xfbl\x86$$\x11\xf1\xc1\xf1!=\xddy\xb3\x83\x0c\x8dL\xa5\xa9Q+C\xfa\ny\xc4\xc1\xc2~\xe2\x07H\xf72\xa9,\xeb\xa1$\xe1\x0c9\x8c)\xb4\x1deHbMJB&\xf8\xb2\xd2\n\r\x96Ii\xbbI\x8d\xb0\x9d3\xceX\xf9W9G#3\x92B\x8b\tN\x8b&\xbf\xfc\xe4\xd8Ak\xb2\xc8c\x0c\x88O\xde\x8b\x13y\xb8\xe2\x82\x08\x19\x14\x03=\x89/CH_\x1b\x8c\xea\x19\xc2k\x86\xe0\r\xcc\x10|\xf9\x16\xfc\xf8\ts"\xfc\xf8\xd9\xee>\xe7\xcaT\xe2\xf9\xef\xde\xf8(\x81\xa3\x97~A|}\x88\xbfuO\xc4}\xf4\xb4\xf8\xc8\x94\xf0~1\xbc\xb7\x0f\xe6.\xb0>\xfd\xf7\xd5.\x11\xc3\x82\xe8Z\xe3Ct"\xda\xfe\xb0\x93\xfa\x87\xfefy7\x8d\x03\xcb\n\xac\xe5\xe1c\xfe\xe31\xde?\xc4\xa7X`\x18m,k \x84\x87\xc8\xe6\x0e\n\xf1fy8\xecw\xe3E|\x08\xf6\xd1\x9d\x93\xc0\xc86\xf7\xbb.L\xc2Z\xbf\x0c\x84\x8f\xc3\xf4\x0f?\xc8\x83\xeb\x07\xfd\'\xd7\x0f\xdc\xbb\xf8/\xdb\xd9\xef;w/\xe6\xe6y6\xdb\xf4\x9f,\xfb{\xc2\xf9w l\x13\xc1&\xcbz\xd3\x1f><\xce\xff~\\\xfc\xc3i\xfd\xe7\xd5p\xfd\xf0\xdd\xe8\xee;\xf6k\x07\x9a\xc3j\x03\x00K\x16\x1a\xaaa\x0c%)!\x94\x1a\x97J;0\xd7Vk\xc1\xff\xb0\xec\xeb\xc1x\xbb\x0b<\xb0\xd6\xe3.\xb2q\xda\x1a\x94\x17\xc0g d6\x89\x18%C\xde%\x85^\xd7\x8d\xa1\x8em\x06 \xf8\xadB\xfe+W1\x1axr\x12\xd8`0\xfc\x05\xcd\x9c\xe8\xf0\xce**\x81g%\xb1Z\xc6\xe75X\x14\x19Jr\x834!\x81\x03\x88*\x17\x10@\xaa\xa8Z\xfdTrpyu\r\x89\xd6\xc0\x92\xa2Y\x04\x80\x15:u\xe7\xfc\xa2tYF\x98\xb6\xb4\xda5\x80&p\x90Bs"\x01\xf0\x12\x89\xe7\xa4RUj\xdc\x1f \xb0\x8c\x0b\xa2T\x92/\xa8:\x80g\x00\x12\xa0i\xdc\xb7j\trM#6\xe6\xa3\x17/m\xd7Cs\x9e\xfb\xc2\xb7\xb7)\xe3\x86g\xdca\x80\x12\x97\xcc\xc8\xe7q\xc9ta\xb5\xdbn\xd2p\xe0 
TF\xa6z\x0f\xb1\xaeg\xb3\x9b\xae\x9c\x0er\'c\xb6q=\x01\xdd`\x02<\xbeV\x1c\xa0\x0e&S\xe6\xf8\x1bt$p\xd5\x92k\x12A\x07\xeb\t}\xce\x04^c~\x94\xc1\xd5s}\xce\x976b\xb2B\xe5y5!4\x93e#>\xd3\x14\t\xcdKM*kE\xf2X\x04J\xa5V\x8d$\x8f\xf4\xd8\r\xa7\xd0\n\xadFdd\xce-\x06+\xe7\xd0\xf4\xa5\xde\xa5\x92\x94\xf33\x11Q=\'vQ\xe2WF\xae\xb7\xc5\xe8Z\xec\xe0\x92Z\xa7\xc7hf\xdc\xacQQ\x9b\xc6L\x9f\x81\xe4\xa6\xe0>\xafpNL7\x18\xb9\xb9\x8d\xab\xbb\xa9\x81\x9a\xe7\xef$\xf9\xa7\x99\xba\xbc\x04\xe0\x95\x15\x93\x91\x16`\x86\xc9$\xb5\xe0/o\xecUR\xc9-S\xeaU\xe4\x88o\xe2X\x9a\xa4\x96\xe2T\xaac\xb2\x10\xd2\x9a\xb9\xf9m\x02\xc8\x96%VSy"\x8e\xf5\x90iTR\xc5[\x11-\xb8\x0c\x80\xa5\xbc\x15=\x01\xf3\x89UT)#\xdcrW\xa9\xda\x93\xb6\xe2\x12\xf3\n\x86R\x93\x8a\x11\xe9\x01\xc0\x9a"B\x13Q\x18\xcb\x8e\xf8*N\x8br.\xf9\x7f*\x1a\xf3c\xa6\xf2\x82\xbfy\xc7\x08tw\x82n\x8dO\xc1\xf4-\xfci\xe6\xb7\xb2\xc5ny\xb5\x8f;\xc3K\x0e*_\xae)X!D\x96N\x13.\xb1\xf3\x8b\xdfs\xbd\xd93h\xbb\xd7\xca\x10\x1f2\xfe\xc4\x19Z\x05\xbc-\x93b\x19a\xc91\xff\xe4\xf3{|\xd4\xab\xb4M\x19\xe2\xb3\xa3\xf0\xd1\xc9\x90\xc8\x07@\xd2\x94\x9ch\x1cg)\xf7\xba\xdb~\xf9\x80\xa4\x93\xd4\x1e\xe5\xd1\xcc\xad\x96B\x8b&\'\xc1\xdcH\xc5W\xa1B\xeb\xb5|\xdcA\'\xedw\xed\x8b\xa5\x1e\xc7\xc2\xffPK\x01\x02\x1e\x03\n\x00\x00\x00\x00\x00\xd7l\xc2D%n\x1c\xa0\x16\x00\x00\x00\x16\x00\x00\x00\t\x00\x18\x00\x00\x00\x00\x00\x01\x00\x00\x00\xa4\x81\x00\x00\x00\x00README.mdUT\x05\x00\x03Vp\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x01\x02\x1e\x03\n\x00\x00\x00\x00\x00\x9c\xa9\xc2D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x18\x00\x00\x00\x00\x00\x00\x00\x10\x00\xedAY\x00\x00\x00src/UT\x05\x00\x03\xc7\xda\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x01\x02\x1e\x03\x14\x00\x00\x00\x08\x00\xc9l\xc2D\x99\x91v\n\xf2\x03\x00\x00\xac\x06\x00\x00\x0c\x00\x18\x00\x00\x00\x00\x00\x01\x00\x00\x00\xa4\x81\x97\x00\x00\x00src/donut2.cUT\x05\x00\x03:p\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x03\x00\x03\x00\xeb\x00\x00\x00\xcf\x04\x00\x00\x00\x00'
# dump the zipfile data to a file to test
# f = open('/var/www/test.zip', 'wb')
# f.write(zip_file)
# f.close()

REMOTE_ZIP_FILE = 'http://localhost:8080/test.zip'
rz = RemoteZip(REMOTE_ZIP_FILE)
toc = rz.getTableOfContents()
for cfile in toc:
print('%r: %r > %r' % (cfile, toc[cfile]['compressedsize'], toc[cfile]['uncompressedsize']))
content = rz.extractFile(b'src/donut2.c')
print(content.decode('utf-8'))
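Note: the test expects http://localhost:8080/test.zip to be served by a server
that honours HTTP Range requests; Python's bundled http.server ignores the
Range header, so RemoteZip would raise "Ranged requests are not supported".
A minimal sketch of a Range-capable server for the dumped fixture (the file
path and port below are assumptions, not part of the PR):

import os
import re
from http.server import BaseHTTPRequestHandler, HTTPServer

FIXTURE = 'test.zip'  # assumed: the zip_file bytes dumped next to this script

class RangeHandler(BaseHTTPRequestHandler):
    def do_HEAD(self):
        # RemoteZip reads the file size from a HEAD request
        self.send_response(200)
        self.send_header('Content-Length', str(os.path.getsize(FIXTURE)))
        self.end_headers()

    def do_GET(self):
        with open(FIXTURE, 'rb') as f:
            data = f.read()
        m = re.match(r'bytes=(\d+)-(\d+)', self.headers.get('Range', ''))
        if not m:
            # no Range header: send the whole file
            self.send_response(200)
            self.send_header('Content-Length', str(len(data)))
            self.end_headers()
            self.wfile.write(data)
            return
        start, end = int(m.group(1)), int(m.group(2))
        chunk = data[start:end + 1]
        self.send_response(206)
        self.send_header('Content-Range', 'bytes %d-%d/%d' % (start, end, len(data)))
        self.send_header('Content-Length', str(len(chunk)))
        self.end_headers()
        self.wfile.write(chunk)

if __name__ == '__main__':
    HTTPServer(('localhost', 8080), RangeHandler).serve_forever()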