diff --git a/pyremotezip/remotezip.py b/pyremotezip/remotezip.py index 441a1fc..7783a5b 100644 --- a/pyremotezip/remotezip.py +++ b/pyremotezip/remotezip.py @@ -1,10 +1,15 @@ # -*- coding: utf-8 -*- -import urllib2 import zlib +import struct -from urllib2 import HTTPError -from struct import unpack +from urllib.request import Request, urlopen +from urllib.error import HTTPError + +import logging + + +logger = logging.getLogger('pyremotezip') class RemoteZip(object): @@ -19,19 +24,36 @@ def __init__(self, zipURI): self.filesize = None self.zipURI = zipURI self.tableOfContents = None + self.fileList = None def __file_exists(self): # check if file exists - headRequest = urllib2.Request(self.zipURI) + headRequest = Request(self.zipURI) headRequest.get_method = lambda: 'HEAD' try: - response = urllib2.urlopen(headRequest) - self.filesize = int(response.info().getheader('Content-Length')) + response = urlopen(headRequest) + self.filesize = int(response.getheader('Content-Length')) return True except HTTPError as e: - print '%s' % e + logger.error('Unable to retrieve remote zip: %r', e) return False + def __request_range(self, start, offset): + """ This function makes a range http request + """ + + request = Request(self.zipURI) + request.headers['Range'] = 'bytes=%s-%s' % (start, offset, ) + handle = urlopen(request) + + # make sure the response is ranged + return_range = handle.getheader('Content-Range') + if return_range != "bytes %d-%d/%s" % (start, offset, self.filesize, ): + raise Exception("Ranged requests are not supported for this URI") + # got here? we're fine, read the contents + raw_bytes = handle.read() + return raw_bytes + def getTableOfContents(self): """ This function populates the internal tableOfContents list with the contents @@ -43,66 +65,63 @@ def getTableOfContents(self): raise FileNotFoundException() # now request bytes from that size minus a 64kb max zip directory length - request = urllib2.Request(self.zipURI) - start = self.filesize - (65536) + if self.filesize < 64 * 2 ** 10: + start = 0 # if the file is small than 64kb get everything + else: + start = self.filesize - (64 * 2 ** 10) end = self.filesize - 1 - request.headers['Range'] = "bytes=%s-%s" % (start, end) - handle = urllib2.urlopen(request) - - # make sure the response is ranged - return_range = handle.headers.get('Content-Range') - if return_range != "bytes %d-%d/%s" % (start, end, self.filesize): - raise Exception("Ranged requests are not supported for this URI") - - # got here? we're fine, read the contents - raw_bytes = handle.read() - - # now find the end-of-directory: 06054b50 - # we're on little endian maybe - directory_end = raw_bytes.find("\x50\x4b\x05\x06") + raw_bytes = self.__request_range(start, end) + + directory_size, directory_start = self.__read_central_directory_size_and_offset(raw_bytes) + table_of_contents, file_list = self.__read_central_directory(raw_bytes[directory_start:directory_start + directory_size]) + self.tableOfContents = table_of_contents + self.fileList = file_list + return self.tableOfContents + + def __read_central_directory_size_and_offset(self, raw_bytes): + ''' reads the end of central directory structure + ''' + # find the end-of-directory: 06054b50 + directory_end = raw_bytes.find(b"\x50\x4b\x05\x06") if directory_end < 0: - raise Exception("Could not find end of directory") - - # now find the size of the directory: offset 12, 4 bytes - # directory_size = unpack("i", raw_bytes[directory_end+12:directory_end+16])[0] - - # and find the offset from start of file where it can be found - directory_start = unpack("i", raw_bytes[directory_end + 16: directory_end + 20])[0] - - # find the data in the raw_bytes - current_start = directory_start - start - filestart = 0 - compressedsize = 0 - tableOfContents = [] - - try: - while True: - # get file name size (n), extra len (m) and comm len (k) - zip_n = unpack("H", raw_bytes[current_start + 28: current_start + 28 + 2])[0] - zip_m = unpack("H", raw_bytes[current_start + 30: current_start + 30 + 2])[0] - zip_k = unpack("H", raw_bytes[current_start + 32: current_start + 32 + 2])[0] - - filename = raw_bytes[current_start + 46: current_start + 46 + zip_n] - - # check if this is the index file - filestart = unpack("I", raw_bytes[current_start + 42: current_start + 42 + 4])[0] - compressedsize = unpack("I", raw_bytes[current_start + 20: current_start + 20 + 4])[0] - uncompressedsize = unpack("I", raw_bytes[current_start + 24: current_start + 24 + 4])[0] - tableItem = { - 'filename': filename, - 'compressedsize': compressedsize, - 'uncompressedsize': uncompressedsize, - 'filestart': filestart - } - tableOfContents.append(tableItem) - - # not this file, move along - current_start = current_start + 46 + zip_n + zip_m + zip_k - except: - pass - - self.tableOfContents = tableOfContents - return tableOfContents + raise Exception('Could not find EOCD') + eocd = raw_bytes[directory_end:] + cd_size, cd_offset = struct.unpack('ii', eocd[12:20]) + return cd_size, cd_offset + + def __read_central_directory(self, raw_bytes): + ''' reads the zip file central directory structure + ''' + table_of_contents = {} + file_list = [] + while True: + try: + comp_size, uncomp_size = struct.unpack('II', raw_bytes[20:20 + 8]) + except struct.error: + break + n, m, k = struct.unpack('HHH', raw_bytes[28:28 + 6]) + offset, = struct.unpack('I', raw_bytes[42:42 + 4]) + filename = raw_bytes[46:46 + n] + table_of_contents[filename] = {'filename': filename, + 'compressedsize': comp_size, + 'uncompressedsize': uncomp_size, + 'filestart': offset} + file_list.append(filename) + raw_bytes = raw_bytes[46 + n + m + k:] + return table_of_contents, file_list + + def __read_data_descriptor(self, local_file_header, compressed_file_start): + start_compressed_data = local_file_header[compressed_file_start:] + start_data_descriptor = start_compressed_data.find(b'\x50\x4b\x07\x08') + if start_data_descriptor > 0: + crc, comp_size, uncomp_size = struct.unpack('III', + local_file_header[start_data_descriptor + 4:start_data_descriptor + 16]) + return crc, comp_size, uncomp_size + + def __read_local_file_header(self, raw_bytes): + zip_n, zip_m = struct.unpack("HH", raw_bytes[26:30]) + header_size = 30 + zip_n + zip_m + return header_size def extractFile(self, filename): """ @@ -110,38 +129,26 @@ def extractFile(self, filename): the entire zip file. The filename argument should match whatever is in the 'filename' key of the tableOfContents. """ - files = [x for x in self.tableOfContents if x['filename'] == filename] - if len(files) == 0: - raise FileNotFoundException() + if filename not in self.tableOfContents: + raise FileNotFoundException('Requested file not found: %r' % (filename)) - fileRecord = files[0] + file_offset = self.tableOfContents[filename]['filestart'] + file_compressed_size = self.tableOfContents[filename]['compressedsize'] + file_uncompressed_size = self.tableOfContents[filename]['uncompressedsize'] - # got here? need to fetch the file size - metaheadroom = 1024 # should be enough - request = urllib2.Request(self.zipURI) - end = fileRecord['filestart'] + fileRecord['compressedsize'] + metaheadroom - request.headers['Range'] = "bytes=%s-%s" % (fileRecord['filestart'], end) - handle = urllib2.urlopen(request) - filedata = handle.read() + local_file_header = self.__request_range(file_offset, file_offset + 30) + local_file_header_size = self.__read_local_file_header(local_file_header) + compressed_data = self.__request_range(file_offset + local_file_header_size, file_offset + local_file_header_size + file_compressed_size - 1) - # find start of raw file data - zip_n = unpack("H", filedata[26:28])[0] - zip_m = unpack("H", filedata[28:30])[0] - - # check compressed size - comp_size = unpack("I", filedata[18:22])[0] - if comp_size != fileRecord['compressedsize']: - raise Exception("Something went wrong. Directory and file header disagree of compressed file size") - - raw_zip_data = filedata[30 + zip_n + zip_m: 30 + zip_n + zip_m + comp_size] - uncompressed_data = "" + if file_compressed_size == file_uncompressed_size: + return compressed_data + uncompressed_data = b'' dec = zlib.decompressobj(-zlib.MAX_WBITS) - for chunk in raw_zip_data: - rv = dec.decompress(chunk) + for chunk in compressed_data: + rv = dec.decompress(bytes([chunk])) if rv: uncompressed_data = uncompressed_data + rv - return uncompressed_data diff --git a/test_remotezip.py b/test_remotezip.py new file mode 100644 index 0000000..e51ec71 --- /dev/null +++ b/test_remotezip.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- + +from pyremotezip.remotezip import RemoteZip + +zip_file = b'PK\x03\x04\n\x00\x00\x00\x00\x00\xd7l\xc2D%n\x1c\xa0\x16\x00\x00\x00\x16\x00\x00\x00\t\x00\x1c\x00README.mdUT\t\x00\x03Vp\x8cS\xc4\xda\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00README FILE\n\ndonut2.c\nPK\x03\x04\n\x00\x00\x00\x00\x00\x9c\xa9\xc2D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x1c\x00src/UT\t\x00\x03\xc7\xda\x8cS#\xdb\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x03\x04\x14\x00\x00\x00\x08\x00\xc9l\xc2D\x99\x91v\n\xf2\x03\x00\x00\xac\x06\x00\x00\x0c\x00\x1c\x00src/donut2.cUT\t\x00\x03:p\x8cS\xc7\xda\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00MUks\xa2H\x14\xfd\xce\xaf\xe8tb\xec\x86F\x1bDcd\x1b7V\xac\xdd\xa4fL\xed#&\x19d\\$j\xf0\x01\x8a\x9a@\x94\xf9\xed\xdb `\xb0\xca\xe2\xdc{\xcf\xed\xfbl\x86$$\x11\xf1\xc1\xf1!=\xddy\xb3\x83\x0c\x8dL\xa5\xa9Q+C\xfa\ny\xc4\xc1\xc2~\xe2\x07H\xf72\xa9,\xeb\xa1$\xe1\x0c9\x8c)\xb4\x1deHbMJB&\xf8\xb2\xd2\n\r\x96Ii\xbbI\x8d\xb0\x9d3\xceX\xf9W9G#3\x92B\x8b\tN\x8b&\xbf\xfc\xe4\xd8Ak\xb2\xc8c\x0c\x88O\xde\x8b\x13y\xb8\xe2\x82\x08\x19\x14\x03=\x89/CH_\x1b\x8c\xea\x19\xc2k\x86\xe0\r\xcc\x10|\xf9\x16\xfc\xf8\ts"\xfc\xf8\xd9\xee>\xe7\xcaT\xe2\xf9\xef\xde\xf8(\x81\xa3\x97~A|}\x88\xbfuO\xc4}\xf4\xb4\xf8\xc8\x94\xf0~1\xbc\xb7\x0f\xe6.\xb0>\xfd\xf7\xd5.\x11\xc3\x82\xe8Z\xe3Ct"\xda\xfe\xb0\x93\xfa\x87\xfefy7\x8d\x03\xcb\n\xac\xe5\xe1c\xfe\xe31\xde?\xc4\xa7X`\x18m,k \x84\x87\xc8\xe6\x0e\n\xf1fy8\xecw\xe3E|\x08\xf6\xd1\x9d\x93\xc0\xc86\xf7\xbb.L\xc2Z\xbf\x0c\x84\x8f\xc3\xf4\x0f?\xc8\x83\xeb\x07\xfd\'\xd7\x0f\xdc\xbb\xf8/\xdb\xd9\xef;w/\xe6\xe6y6\xdb\xf4\x9f,\xfb{\xc2\xf9w l\x13\xc1&\xcbz\xd3\x1f><\xce\xff~\\\xfc\xc3i\xfd\xe7\xd5p\xfd\xf0\xdd\xe8\xee;\xf6k\x07\x9a\xc3j\x03\x00K\x16\x1a\xaaa\x0c%)!\x94\x1a\x97J;0\xd7Vk\xc1\xff\xb0\xec\xeb\xc1x\xbb\x0b<\xb0\xd6\xe3.\xb2q\xda\x1a\x94\x17\xc0g d6\x89\x18%C\xde%\x85^\xd7\x8d\xa1\x8em\x06 \xf8\xadB\xfe+W1\x1axr\x12\xd8`0\xfc\x05\xcd\x9c\xe8\xf0\xce**\x81g%\xb1Z\xc6\xe75X\x14\x19Jr\x834!\x81\x03\x88*\x17\x10@\xaa\xa8Z\xfdTrpyu\r\x89\xd6\xc0\x92\xa2Y\x04\x80\x15:u\xe7\xfc\xa2tYF\x98\xb6\xb4\xda5\x80&p\x90Bs"\x01\xf0\x12\x89\xe7\xa4RUj\xdc\x1f \xb0\x8c\x0b\xa2T\x92/\xa8:\x80g\x00\x12\xa0i\xdc\xb7j\trM#6\xe6\xa3\x17/m\xd7Cs\x9e\xfb\xc2\xb7\xb7)\xe3\x86g\xdca\x80\x12\x97\xcc\xc8\xe7q\xc9ta\xb5\xdbn\xd2p\xe0 TF\xa6z\x0f\xb1\xaeg\xb3\x9b\xae\x9c\x0er\'c\xb6q=\x01\xdd`\x02<\xbeV\x1c\xa0\x0e&S\xe6\xf8\x1bt$p\xd5\x92k\x12A\x07\xeb\t}\xce\x04^c~\x94\xc1\xd5s}\xce\x976b\xb2B\xe5y5!4\x93e#>\xd3\x14\t\xcdKM*kE\xf2X\x04J\xa5V\x8d$\x8f\xf4\xd8\r\xa7\xd0\n\xadFdd\xce-\x06+\xe7\xd0\xf4\xa5\xde\xa5\x92\x94\xf33\x11Q=\'vQ\xe2WF\xae\xb7\xc5\xe8Z\xec\xe0\x92Z\xa7\xc7hf\xdc\xacQQ\x9b\xc6L\x9f\x81\xe4\xa6\xe0>\xafpNL7\x18\xb9\xb9\x8d\xab\xbb\xa9\x81\x9a\xe7\xef$\xf9\xa7\x99\xba\xbc\x04\xe0\x95\x15\x93\x91\x16`\x86\xc9$\xb5\xe0/o\xecUR\xc9-S\xeaU\xe4\x88o\xe2X\x9a\xa4\x96\xe2T\xaac\xb2\x10\xd2\x9a\xb9\xf9m\x02\xc8\x96%VSy"\x8e\xf5\x90iTR\xc5[\x11-\xb8\x0c\x80\xa5\xbc\x15=\x01\xf3\x89UT)#\xdcrW\xa9\xda\x93\xb6\xe2\x12\xf3\n\x86R\x93\x8a\x11\xe9\x01\xc0\x9a"B\x13Q\x18\xcb\x8e\xf8*N\x8br.\xf9\x7f*\x1a\xf3c\xa6\xf2\x82\xbfy\xc7\x08tw\x82n\x8dO\xc1\xf4-\xfci\xe6\xb7\xb2\xc5ny\xb5\x8f;\xc3K\x0e*_\xae)X!D\x96N\x13.\xb1\xf3\x8b\xdfs\xbd\xd93h\xbb\xd7\xca\x10\x1f2\xfe\xc4\x19Z\x05\xbc-\x93b\x19a\xc91\xff\xe4\xf3{|\xd4\xab\xb4M\x19\xe2\xb3\xa3\xf0\xd1\xc9\x90\xc8\x07@\xd2\x94\x9ch\x1cg)\xf7\xba\xdb~\xf9\x80\xa4\x93\xd4\x1e\xe5\xd1\xcc\xad\x96B\x8b&\'\xc1\xdcH\xc5W\xa1B\xeb\xb5|\xdcA\'\xedw\xed\x8b\xa5\x1e\xc7\xc2\xffPK\x01\x02\x1e\x03\n\x00\x00\x00\x00\x00\xd7l\xc2D%n\x1c\xa0\x16\x00\x00\x00\x16\x00\x00\x00\t\x00\x18\x00\x00\x00\x00\x00\x01\x00\x00\x00\xa4\x81\x00\x00\x00\x00README.mdUT\x05\x00\x03Vp\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x01\x02\x1e\x03\n\x00\x00\x00\x00\x00\x9c\xa9\xc2D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x18\x00\x00\x00\x00\x00\x00\x00\x10\x00\xedAY\x00\x00\x00src/UT\x05\x00\x03\xc7\xda\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x01\x02\x1e\x03\x14\x00\x00\x00\x08\x00\xc9l\xc2D\x99\x91v\n\xf2\x03\x00\x00\xac\x06\x00\x00\x0c\x00\x18\x00\x00\x00\x00\x00\x01\x00\x00\x00\xa4\x81\x97\x00\x00\x00src/donut2.cUT\x05\x00\x03:p\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x03\x00\x03\x00\xeb\x00\x00\x00\xcf\x04\x00\x00\x00\x00' +# dump the zipfile data to a file to test +# f = open('/var/www/test.zip', 'wb') +# f.write(zip_file) +# f.close() + +REMOTE_ZIP_FILE = 'http://localhost:8080/test.zip' +rz = RemoteZip(REMOTE_ZIP_FILE) +toc = rz.getTableOfContents() +for cfile in toc: + print('%r: %r > %r' % (cfile, toc[cfile]['compressedsize'], toc[cfile]['uncompressedsize'])) +content = rz.extractFile(b'src/donut2.c') +print(content.decode('utf-8'))