3333
3434from . import igzip_lib , isal_zlib
3535
36- __all__ = ["IGzipFile" , "open" , "compress" , "decompress" , "BadGzipFile" ]
36+ __all__ = ["IGzipFile" , "open" , "compress" , "decompress" , "BadGzipFile" ,
37+ "READ_BUFFER_SIZE" ]
3738
3839_COMPRESS_LEVEL_FAST = isal_zlib .ISAL_BEST_SPEED
3940_COMPRESS_LEVEL_TRADEOFF = isal_zlib .ISAL_DEFAULT_COMPRESSION
4041_COMPRESS_LEVEL_BEST = isal_zlib .ISAL_BEST_COMPRESSION
4142
43+ #: The amount of data that is read in at once when decompressing a file.
44+ #: Increasing this value may increase performance.
45+ READ_BUFFER_SIZE = io .DEFAULT_BUFFER_SIZE
46+
4247FTEXT , FHCRC , FEXTRA , FNAME , FCOMMENT = 1 , 2 , 4 , 8 , 16
4348
4449try :
@@ -229,8 +234,8 @@ def __init__(self, fp):
229234 # Call the init method of gzip._GzipReader's parent here.
230235 # It is not very invasive and allows us to override _PaddedFile
231236 _compression .DecompressReader .__init__ (
232- self , _PaddedFile (fp ), isal_zlib . decompressobj ,
233- wbits = - isal_zlib . MAX_WBITS )
237+ self , _PaddedFile (fp ), igzip_lib . IgzipDecompressor ,
238+ hist_bits = igzip_lib . MAX_HIST_BITS , flag = igzip_lib . DECOMP_DEFLATE )
234239 # Set flag indicating start of a new member
235240 self ._new_member = True
236241 self ._last_mtime = None
@@ -241,6 +246,57 @@ def _add_read_data(self, data):
241246 self ._crc = isal_zlib .crc32 (data , self ._crc )
242247 self ._stream_size += len (data )
243248
def read(self, size=-1):
    """Read and return up to *size* bytes of decompressed data.

    A negative *size* delegates to ``self.readall()``; ``size == 0``
    returns ``b""`` immediately. Otherwise the method loops until the
    decompressor yields some data or no further gzip member is found.

    Returns:
        The decompressed bytes, or ``b""`` when no further gzip header
        can be read.

    Raises:
        EOFError: if the underlying file returns no more data before the
            decompressor produced any output for this call.
    """
    if size < 0:
        return self.readall()
    # size=0 is special because decompress(max_length=0) is not supported
    if not size:
        return b""

    # For certain input data, a single call to decompress() may not return
    # any data. In this case, retry until we get some data or reach EOF.
    while True:
        if self._decompressor.eof:
            # Ending case: we've come to the end of a member in the file,
            # so finish up this member, and read a new gzip header.
            # Check the CRC and file size, and set the flag so we read
            # a new member.
            self._read_eof()
            self._new_member = True
            self._decompressor = self._decomp_factory(
                **self._decomp_args)

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            if not self._read_gzip_header():
                self._size = self._pos
                return b""
            self._new_member = False

        if self._decompressor.needs_input:
            # Read a chunk of data from the file
            buf = self._fp.read(READ_BUFFER_SIZE)
            uncompress = self._decompressor.decompress(buf, size)
        else:
            # Nothing was read from the file in this iteration. Use a
            # sentinel so the EOF check below neither hits an unbound
            # name (first iteration of a call) nor a stale value.
            buf = None
            uncompress = self._decompressor.decompress(b"", size)

        if self._decompressor.unused_data != b"":
            # Prepend the already read bytes to the fileobj so they can
            # be seen by _read_eof() and _read_gzip_header()
            self._fp.prepend(self._decompressor.unused_data)

        if uncompress != b"":
            break
        # Only an actual, empty read from the file means a truncated stream.
        if buf == b"":
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")

    self._add_read_data(uncompress)
    self._pos += len(uncompress)
    return uncompress
299+
244300
245301# Aliases for improved compatibility with CPython gzip module.
246302GzipFile = IGzipFile
@@ -376,13 +432,18 @@ def _argument_parser():
376432 dest = "compress" ,
377433 const = False ,
378434 help = "Decompress the file instead of compressing." )
379- parser .add_argument ("-c" , "--stdout" , action = "store_true" ,
380- help = "write on standard output" )
435+ output_group = parser .add_mutually_exclusive_group ()
436+ output_group .add_argument ("-c" , "--stdout" , action = "store_true" ,
437+ help = "write on standard output" )
438+ output_group .add_argument ("-o" , "--output" ,
439+ help = "Write to this output file" )
440+ parser .add_argument ("-f" , "--force" , action = "store_true" ,
441+ help = "Overwrite output without prompting" )
381442 # -b flag not taken by either gzip or igzip. Hidden attribute. Above 32K
382443 # diminishing returns hit. _compression.BUFFER_SIZE = 8k. But 32K is about
383444 # ~6% faster.
384445 parser .add_argument ("-b" , "--buffer-size" ,
385- default = 32 * 1024 , type = int ,
446+ default = 128 * 1024 , type = int ,
386447 help = argparse .SUPPRESS )
387448 return parser
388449
@@ -392,32 +453,49 @@ def main():
392453
393454 compresslevel = args .compresslevel or _COMPRESS_LEVEL_TRADEOFF
394455
395- # Determine input file
396- if args .compress and args .file is None :
397- in_file = sys .stdin .buffer
398- elif args .compress and args .file is not None :
399- in_file = io .open (args .file , mode = "rb" )
400- elif not args .compress and args .file is None :
401- in_file = IGzipFile (mode = "rb" , fileobj = sys .stdin .buffer )
402- elif not args .compress and args .file is not None :
403- base , extension = os .path .splitext (args .file )
404- if extension != ".gz" and not args .stdout :
405- sys .exit (f"filename doesn't end in .gz: { args .file !r} . "
406- f"Cannot determine output filename." )
407- in_file = open (args .file , "rb" )
408-
409- # Determine output file
410- if args .compress and (args .file is None or args .stdout ):
411- out_file = IGzipFile (mode = "wb" , compresslevel = compresslevel ,
412- fileobj = sys .stdout .buffer )
413- elif args .compress and args .file is not None :
414- out_file = open (args .file + ".gz" , mode = "wb" ,
415- compresslevel = compresslevel )
416- elif not args .compress and (args .file is None or args .stdout ):
417- out_file = sys .stdout .buffer
418- elif not args .compress and args .file is not None :
419- out_file = io .open (base , "wb" )
456+ if args .output :
457+ out_filepath = args .output
458+ elif args .stdout :
459+ out_filepath = None # to stdout
460+ elif args .file is None :
461+ out_filepath = None # to stdout
462+ else :
463+ if args .compress :
464+ out_filepath = args .file + ".gz"
465+ else :
466+ out_filepath , extension = os .path .splitext (args .file )
467+ if extension != ".gz" and not args .stdout :
468+ sys .exit (f"filename doesn't end in .gz: { args .file !r} . "
469+ f"Cannot determine output filename." )
470+ if out_filepath is not None and not args .force :
471+ if os .path .exists (out_filepath ):
472+ yes_or_no = input (f"{ out_filepath } already exists; "
473+ f"do you wish to overwrite (y/n)?" )
474+ if yes_or_no not in {"y" , "Y" , "yes" }:
475+ sys .exit ("not overwritten" )
476+
477+ if args .compress :
478+ if args .file is None :
479+ in_file = sys .stdin .buffer
480+ else :
481+ in_file = io .open (args .file , mode = "rb" )
482+ if out_filepath is not None :
483+ out_file = open (out_filepath , "wb" , compresslevel = compresslevel )
484+ else :
485+ out_file = IGzipFile (mode = "wb" , fileobj = sys .stdout .buffer ,
486+ compresslevel = compresslevel )
487+ else :
488+ if args .file :
489+ in_file = open (args .file , mode = "rb" )
490+ else :
491+ in_file = IGzipFile (mode = "rb" , fileobj = sys .stdin .buffer )
492+ if out_filepath is not None :
493+ out_file = io .open (out_filepath , mode = "wb" )
494+ else :
495+ out_file = sys .stdout .buffer
420496
497+ global READ_BUFFER_SIZE
498+ READ_BUFFER_SIZE = args .buffer_size
421499 try :
422500 while True :
423501 block = in_file .read (args .buffer_size )
0 commit comments