187 changes: 97 additions & 90 deletions pyremotezip/remotezip.py
@@ -1,10 +1,15 @@
# -*- coding: utf-8 -*-

import zlib
import struct

from urllib.request import Request, urlopen
from urllib.error import HTTPError

import logging


logger = logging.getLogger('pyremotezip')


class RemoteZip(object):
@@ -19,19 +24,36 @@ def __init__(self, zipURI):
self.filesize = None
self.zipURI = zipURI
self.tableOfContents = None
self.fileList = None
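# fileList preserves the central directory order, while tableOfContents is
# keyed by filename for direct lookups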

def __file_exists(self):
# check if file exists
headRequest = Request(self.zipURI)
headRequest.get_method = lambda: 'HEAD'
try:
response = urlopen(headRequest)
self.filesize = int(response.getheader('Content-Length'))
return True
except HTTPError as e:
logger.error('Unable to retrieve remote zip: %r', e)
return False

def __request_range(self, start, offset):
""" This function makes a range http request
"""

request = Request(self.zipURI)
request.headers['Range'] = 'bytes=%s-%s' % (start, offset, )
handle = urlopen(request)
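# Illustrative exchange (made-up values): the request carries
# "Range: bytes=100-199"; a server that supports ranges answers
# "206 Partial Content" with "Content-Range: bytes 100-199/<filesize>",
# which is exactly what the check below verifies. A server that ignores
# the Range header answers 200 with the whole file and no Content-Range.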

# make sure the response is ranged
return_range = handle.getheader('Content-Range')
if return_range != "bytes %d-%d/%s" % (start, offset, self.filesize, ):
raise Exception("Ranged requests are not supported for this URI")
# got here? we're fine, read the contents
raw_bytes = handle.read()
return raw_bytes

def getTableOfContents(self):
"""
This function populates the internal tableOfContents list with the contents
@@ -43,105 +65,90 @@ def getTableOfContents(self):
raise FileNotFoundException()

# now request bytes from that size minus a 64kb max zip directory length
if self.filesize < 64 * 2 ** 10:
start = 0  # if the file is smaller than 64kb, get everything
else:
start = self.filesize - (64 * 2 ** 10)
end = self.filesize - 1
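# the end-of-central-directory record is 22 bytes plus an archive comment of
# at most 65535 bytes, so it is expected to sit inside this final 64kb window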
raw_bytes = self.__request_range(start, end)

directory_size, directory_start = self.__read_central_directory_size_and_offset(raw_bytes)
# directory_start is an offset from the start of the remote file, while
# raw_bytes begins at `start`, so translate it into a buffer offset first
directory_start -= start
table_of_contents, file_list = self.__read_central_directory(raw_bytes[directory_start:directory_start + directory_size])
self.tableOfContents = table_of_contents
self.fileList = file_list
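# tableOfContents maps each (bytes) filename from the central directory to a
# dict with 'filename', 'compressedsize', 'uncompressedsize' and 'filestart',
# e.g. {b'README.md': {'filename': b'README.md', 'compressedsize': 22,
#                      'uncompressedsize': 22, 'filestart': 0}} (illustrative values)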
return self.tableOfContents

def __read_central_directory_size_and_offset(self, raw_bytes):
''' reads the end of central directory structure
'''
# find the end-of-directory: 06054b50
directory_end = raw_bytes.find(b"\x50\x4b\x05\x06")
if directory_end < 0:
raise Exception('Could not find EOCD')
eocd = raw_bytes[directory_end:]
# central directory size is at offset 12 of the EOCD record and its offset
# from the start of the file is at offset 16 (both little-endian uint32)
cd_size, cd_offset = struct.unpack('<II', eocd[12:20])
return cd_size, cd_offset

def __read_central_directory(self, raw_bytes):
''' reads the zip file central directory structure
'''
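# Each central directory file header is 46 fixed bytes followed by the
# filename, extra field and comment. Only these little-endian fields are read:
#   offset 20: compressed size (4 bytes)    offset 24: uncompressed size (4 bytes)
#   offset 28: filename length (2 bytes)    offset 30: extra field length (2 bytes)
#   offset 32: comment length (2 bytes)     offset 42: local header offset (4 bytes)
#   offset 46: filename (variable)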
table_of_contents = {}
file_list = []
while True:
try:
comp_size, uncomp_size = struct.unpack('<II', raw_bytes[20:20 + 8])
except struct.error:
# ran past the last central directory entry
break
n, m, k = struct.unpack('<HHH', raw_bytes[28:28 + 6])
offset, = struct.unpack('<I', raw_bytes[42:42 + 4])
filename = raw_bytes[46:46 + n]
table_of_contents[filename] = {'filename': filename,
'compressedsize': comp_size,
'uncompressedsize': uncomp_size,
'filestart': offset}
file_list.append(filename)
raw_bytes = raw_bytes[46 + n + m + k:]
return table_of_contents, file_list

def __read_data_descriptor(self, local_file_header, compressed_file_start):
start_compressed_data = local_file_header[compressed_file_start:]
start_data_descriptor = start_compressed_data.find(b'\x50\x4b\x07\x08')
if start_data_descriptor >= 0:
# descriptor: 4-byte signature, then crc-32, compressed and uncompressed sizes
crc, comp_size, uncomp_size = struct.unpack(
'<III', start_compressed_data[start_data_descriptor + 4:start_data_descriptor + 16])
return crc, comp_size, uncomp_size

def __read_local_file_header(self, raw_bytes):
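# The local file header is a fixed 30-byte structure followed by the filename
# (length at offset 26) and the extra field (length at offset 28); the
# compressed data starts immediately after, which is what header_size captures.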
zip_n, zip_m = struct.unpack("<HH", raw_bytes[26:30])
header_size = 30 + zip_n + zip_m
return header_size

def extractFile(self, filename):
"""
This function will extract a single file from the remote zip without downloading
the entire zip file. The filename argument must match one of the keys of
tableOfContents, i.e. a bytes filename exactly as stored in the archive.
"""
if filename not in self.tableOfContents:
raise FileNotFoundException('Requested file not found: %r' % (filename))

file_offset = self.tableOfContents[filename]['filestart']
file_compressed_size = self.tableOfContents[filename]['compressedsize']
file_uncompressed_size = self.tableOfContents[filename]['uncompressedsize']

local_file_header = self.__request_range(file_offset, file_offset + 30)
local_file_header_size = self.__read_local_file_header(local_file_header)
compressed_data = self.__request_range(file_offset + local_file_header_size, file_offset + local_file_header_size + file_compressed_size - 1)
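# the second range request covers exactly the compressed payload: it starts
# right after the local file header and spans file_compressed_size bytes (inclusive range)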

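# equal compressed and uncompressed sizes are treated as a stored
# (uncompressed) entry; anything else is assumed to be raw deflate data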
if file_compressed_size == file_uncompressed_size:
return compressed_data

dec = zlib.decompressobj(-zlib.MAX_WBITS)
# inflate the raw deflate stream in one go and flush any buffered remainder
uncompressed_data = dec.decompress(compressed_data) + dec.flush()

return uncompressed_data


17 changes: 17 additions & 0 deletions test_remotezip.py
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

from pyremotezip.remotezip import RemoteZip

zip_file = b'PK\x03\x04\n\x00\x00\x00\x00\x00\xd7l\xc2D%n\x1c\xa0\x16\x00\x00\x00\x16\x00\x00\x00\t\x00\x1c\x00README.mdUT\t\x00\x03Vp\x8cS\xc4\xda\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00README FILE\n\ndonut2.c\nPK\x03\x04\n\x00\x00\x00\x00\x00\x9c\xa9\xc2D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x1c\x00src/UT\t\x00\x03\xc7\xda\x8cS#\xdb\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x03\x04\x14\x00\x00\x00\x08\x00\xc9l\xc2D\x99\x91v\n\xf2\x03\x00\x00\xac\x06\x00\x00\x0c\x00\x1c\x00src/donut2.cUT\t\x00\x03:p\x8cS\xc7\xda\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00MUks\xa2H\x14\xfd\xce\xaf\xe8tb\xec\x86F\x1bDcd\x1b7V\xac\xdd\xa4fL\xed#&\x19d\\$j\xf0\x01\x8a\x9a@\x94\xf9\xed\xdb `\xb0\xca\xe2\xdc{\xcf\xed\xfbl\x86$$\x11\xf1\xc1\xf1!=\xddy\xb3\x83\x0c\x8dL\xa5\xa9Q+C\xfa\ny\xc4\xc1\xc2~\xe2\x07H\xf72\xa9,\xeb\xa1$\xe1\x0c9\x8c)\xb4\x1deHbMJB&\xf8\xb2\xd2\n\r\x96Ii\xbbI\x8d\xb0\x9d3\xceX\xf9W9G#3\x92B\x8b\tN\x8b&\xbf\xfc\xe4\xd8Ak\xb2\xc8c\x0c\x88O\xde\x8b\x13y\xb8\xe2\x82\x08\x19\x14\x03=\x89/CH_\x1b\x8c\xea\x19\xc2k\x86\xe0\r\xcc\x10|\xf9\x16\xfc\xf8\ts"\xfc\xf8\xd9\xee>\xe7\xcaT\xe2\xf9\xef\xde\xf8(\x81\xa3\x97~A|}\x88\xbfuO\xc4}\xf4\xb4\xf8\xc8\x94\xf0~1\xbc\xb7\x0f\xe6.\xb0>\xfd\xf7\xd5.\x11\xc3\x82\xe8Z\xe3Ct"\xda\xfe\xb0\x93\xfa\x87\xfefy7\x8d\x03\xcb\n\xac\xe5\xe1c\xfe\xe31\xde?\xc4\xa7X`\x18m,k \x84\x87\xc8\xe6\x0e\n\xf1fy8\xecw\xe3E|\x08\xf6\xd1\x9d\x93\xc0\xc86\xf7\xbb.L\xc2Z\xbf\x0c\x84\x8f\xc3\xf4\x0f?\xc8\x83\xeb\x07\xfd\'\xd7\x0f\xdc\xbb\xf8/\xdb\xd9\xef;w/\xe6\xe6y6\xdb\xf4\x9f,\xfb{\xc2\xf9w l\x13\xc1&\xcbz\xd3\x1f><\xce\xff~\\\xfc\xc3i\xfd\xe7\xd5p\xfd\xf0\xdd\xe8\xee;\xf6k\x07\x9a\xc3j\x03\x00K\x16\x1a\xaaa\x0c%)!\x94\x1a\x97J;0\xd7Vk\xc1\xff\xb0\xec\xeb\xc1x\xbb\x0b<\xb0\xd6\xe3.\xb2q\xda\x1a\x94\x17\xc0g d6\x89\x18%C\xde%\x85^\xd7\x8d\xa1\x8em\x06 \xf8\xadB\xfe+W1\x1axr\x12\xd8`0\xfc\x05\xcd\x9c\xe8\xf0\xce**\x81g%\xb1Z\xc6\xe75X\x14\x19Jr\x834!\x81\x03\x88*\x17\x10@\xaa\xa8Z\xfdTrpyu\r\x89\xd6\xc0\x92\xa2Y\x04\x80\x15:u\xe7\xfc\xa2tYF\x98\xb6\xb4\xda5\x80&p\x90Bs"\x01\xf0\x12\x89\xe7\xa4RUj\xdc\x1f \xb0\x8c\x0b\xa2T\x92/\xa8:\x80g\x00\x12\xa0i\xdc\xb7j\trM#6\xe6\xa3\x17/m\xd7Cs\x9e\xfb\xc2\xb7\xb7)\xe3\x86g\xdca\x80\x12\x97\xcc\xc8\xe7q\xc9ta\xb5\xdbn\xd2p\xe0 
TF\xa6z\x0f\xb1\xaeg\xb3\x9b\xae\x9c\x0er\'c\xb6q=\x01\xdd`\x02<\xbeV\x1c\xa0\x0e&S\xe6\xf8\x1bt$p\xd5\x92k\x12A\x07\xeb\t}\xce\x04^c~\x94\xc1\xd5s}\xce\x976b\xb2B\xe5y5!4\x93e#>\xd3\x14\t\xcdKM*kE\xf2X\x04J\xa5V\x8d$\x8f\xf4\xd8\r\xa7\xd0\n\xadFdd\xce-\x06+\xe7\xd0\xf4\xa5\xde\xa5\x92\x94\xf33\x11Q=\'vQ\xe2WF\xae\xb7\xc5\xe8Z\xec\xe0\x92Z\xa7\xc7hf\xdc\xacQQ\x9b\xc6L\x9f\x81\xe4\xa6\xe0>\xafpNL7\x18\xb9\xb9\x8d\xab\xbb\xa9\x81\x9a\xe7\xef$\xf9\xa7\x99\xba\xbc\x04\xe0\x95\x15\x93\x91\x16`\x86\xc9$\xb5\xe0/o\xecUR\xc9-S\xeaU\xe4\x88o\xe2X\x9a\xa4\x96\xe2T\xaac\xb2\x10\xd2\x9a\xb9\xf9m\x02\xc8\x96%VSy"\x8e\xf5\x90iTR\xc5[\x11-\xb8\x0c\x80\xa5\xbc\x15=\x01\xf3\x89UT)#\xdcrW\xa9\xda\x93\xb6\xe2\x12\xf3\n\x86R\x93\x8a\x11\xe9\x01\xc0\x9a"B\x13Q\x18\xcb\x8e\xf8*N\x8br.\xf9\x7f*\x1a\xf3c\xa6\xf2\x82\xbfy\xc7\x08tw\x82n\x8dO\xc1\xf4-\xfci\xe6\xb7\xb2\xc5ny\xb5\x8f;\xc3K\x0e*_\xae)X!D\x96N\x13.\xb1\xf3\x8b\xdfs\xbd\xd93h\xbb\xd7\xca\x10\x1f2\xfe\xc4\x19Z\x05\xbc-\x93b\x19a\xc91\xff\xe4\xf3{|\xd4\xab\xb4M\x19\xe2\xb3\xa3\xf0\xd1\xc9\x90\xc8\x07@\xd2\x94\x9ch\x1cg)\xf7\xba\xdb~\xf9\x80\xa4\x93\xd4\x1e\xe5\xd1\xcc\xad\x96B\x8b&\'\xc1\xdcH\xc5W\xa1B\xeb\xb5|\xdcA\'\xedw\xed\x8b\xa5\x1e\xc7\xc2\xffPK\x01\x02\x1e\x03\n\x00\x00\x00\x00\x00\xd7l\xc2D%n\x1c\xa0\x16\x00\x00\x00\x16\x00\x00\x00\t\x00\x18\x00\x00\x00\x00\x00\x01\x00\x00\x00\xa4\x81\x00\x00\x00\x00README.mdUT\x05\x00\x03Vp\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x01\x02\x1e\x03\n\x00\x00\x00\x00\x00\x9c\xa9\xc2D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x18\x00\x00\x00\x00\x00\x00\x00\x10\x00\xedAY\x00\x00\x00src/UT\x05\x00\x03\xc7\xda\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x01\x02\x1e\x03\x14\x00\x00\x00\x08\x00\xc9l\xc2D\x99\x91v\n\xf2\x03\x00\x00\xac\x06\x00\x00\x0c\x00\x18\x00\x00\x00\x00\x00\x01\x00\x00\x00\xa4\x81\x97\x00\x00\x00src/donut2.cUT\x05\x00\x03:p\x8cSux\x0b\x00\x01\x04\xf5\x01\x00\x00\x04\x14\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x03\x00\x03\x00\xeb\x00\x00\x00\xcf\x04\x00\x00\x00\x00'
# dump the zipfile data to a file to test
# f = open('/var/www/test.zip', 'wb')
# f.write(zip_file)
# f.close()

REMOTE_ZIP_FILE = 'http://localhost:8080/test.zip'
rz = RemoteZip(REMOTE_ZIP_FILE)
toc = rz.getTableOfContents()
for cfile in toc:
print('%r: %r > %r' % (cfile, toc[cfile]['compressedsize'], toc[cfile]['uncompressedsize']))
content = rz.extractFile(b'src/donut2.c')
print(content.decode('utf-8'))
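Note: the test expects http://localhost:8080/test.zip to be served by a server
that honours HTTP Range requests; Python's bundled http.server ignores the
Range header, so RemoteZip would raise "Ranged requests are not supported".
A minimal sketch of a Range-capable server for the dumped fixture (the file
path and port below are assumptions, not part of the PR):

import os
import re
from http.server import BaseHTTPRequestHandler, HTTPServer

FIXTURE = 'test.zip'  # assumed: the zip_file bytes dumped next to this script

class RangeHandler(BaseHTTPRequestHandler):
    def do_HEAD(self):
        # RemoteZip reads the file size from a HEAD request
        self.send_response(200)
        self.send_header('Content-Length', str(os.path.getsize(FIXTURE)))
        self.end_headers()

    def do_GET(self):
        with open(FIXTURE, 'rb') as f:
            data = f.read()
        m = re.match(r'bytes=(\d+)-(\d+)', self.headers.get('Range', ''))
        if not m:
            # no Range header: send the whole file
            self.send_response(200)
            self.send_header('Content-Length', str(len(data)))
            self.end_headers()
            self.wfile.write(data)
            return
        start, end = int(m.group(1)), int(m.group(2))
        chunk = data[start:end + 1]
        self.send_response(206)
        self.send_header('Content-Range', 'bytes %d-%d/%d' % (start, end, len(data)))
        self.send_header('Content-Length', str(len(chunk)))
        self.end_headers()
        self.wfile.write(chunk)

if __name__ == '__main__':
    HTTPServer(('localhost', 8080), RangeHandler).serve_forever()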