Merged

Changes from 9 commits

18 commits
1a810db
Drop unsafe loops. On Rust 1.27 performance degradation is within mea…
Shnatsel Jun 22, 2018
488db57
Exit earlier on invalid run length, without leaving behind a buffer w…
Shnatsel Jun 24, 2018
a5e29c8
Fix formatting
Shnatsel Jun 24, 2018
bc3916d
Fix formatting in one more place
Shnatsel Jun 24, 2018
3715c4c
Add semicolon
Shnatsel Jun 24, 2018
11c73f2
run_len_dist: validate that dist is not 0. Fixes information disclosu…
Shnatsel Jun 25, 2018
8f22601
::std::intrinsics now requires 'core_intrinsics' feature, not 'core'
Shnatsel Jun 25, 2018
4e5cc94
Remove use of 'abort' intrinsic now that panic!() is optimized by the…
Shnatsel Jun 25, 2018
dd20d3e
Remove 'unstable' feature from Cargo.toml
Shnatsel Jun 25, 2018
d56abee
Replace panic!() with unreachable!(). Thank you @hauleth for the advice
Shnatsel Jun 25, 2018
61c2c1c
Revert "Remove 'unstable' feature from Cargo.toml"
Shnatsel Jun 26, 2018
d476136
Add an assert() before the last remaining unsafe {}; no performance p…
Shnatsel Jun 26, 2018
fdee4de
Add another assert against potentially reading from uninitialized memory
Shnatsel Jun 26, 2018
9c8f582
Add a comment explaining why `if dist < 1` check is there
Shnatsel Jun 26, 2018
a2955c8
Add an assert for dist correctness right before it's used. Surprising…
Shnatsel Jun 26, 2018
f4418ea
More comments warning the unwary about the one and only unsafe block
Shnatsel Jun 26, 2018
baf9c7e
Drop a wall of text in a comment, it is not helpful. There is an asse…
Shnatsel Jun 26, 2018
a36a517
Explain what each assert actually does
Shnatsel Jun 26, 2018
1 change: 0 additions & 1 deletion Cargo.toml
@@ -9,7 +9,6 @@ keywords = ["deflate", "decompression", "compression", "piston"]

[features]
default = []
Collaborator

Is this needed at all? I think all it did was disable unstable by default.

unstable = []

[dependencies]
adler32 = "1.0.2"
76 changes: 16 additions & 60 deletions src/lib.rs
@@ -65,8 +65,6 @@
//! }
//! ```

#![cfg_attr(feature = "unstable", feature(core))]

extern crate adler32;

use std::cmp;
@@ -182,16 +180,6 @@ struct BitStream<'a> {
state: BitState,
}

// Use this instead of triggering a panic (that will unwind).
#[cfg(feature = "unstable")]
fn abort() -> ! {
unsafe { ::std::intrinsics::abort() }
}
#[cfg(not(feature = "unstable"))]
fn abort() -> ! {
panic!()
}

#[cfg(debug)]
macro_rules! debug { ($($x:tt)*) => (println!($($x)*)) }
#[cfg(not(debug))]
@@ -224,10 +212,7 @@ impl<'a> BitStream<'a> {
return false;
}
if n > 8 && self.state.n < n {
if n > 16 {
// HACK(eddyb) in place of a static assert.
abort();
}
assert!(n <= 16);
if !self.use_byte() {
return false;
}
@@ -248,10 +233,7 @@
}

fn take(&mut self, n: u8) -> Option<u8> {
if n > 8 {
// HACK(eddyb) in place of a static assert.
abort();
}
assert!(n <= 8);
self.take16(n).map(|v: u16| v as u8)
}

@@ -404,7 +386,7 @@ impl CodeLengthReader {
self.result.push(0);
}
}
_ => abort(),
_ => panic!(),
Wouldn't it be better to use unreachable!() macro? It would provide more meaningful name.
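For illustration, a minimal sketch of the suggested shape (a hypothetical helper, not the crate's actual decoder):

    // Minimal sketch, not the crate's actual decoder: the catch-all arm names
    // the invariant instead of a bare panic!().
    fn apply_code_length_code(code: u8) {
        match code {
            0..=15 => { /* literal code length */ }
            16 => { /* repeat the previous length */ }
            17 | 18 => { /* emit a run of zero lengths */ }
            // Code length codes are always in 0..=18, so this arm cannot be
            // reached by valid decoding.
            _ => unreachable!("code length code out of range"),
        }
    }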

}
}
Ok(true)
@@ -625,6 +607,9 @@ impl InflateStream {
fn run_len_dist(&mut self, len: u16, dist: u16) -> Result<Option<u16>, String> {
debug!("RLE -{}; {} (cap={} len={})", dist, len,
self.buffer.capacity(), self.buffer.len());
if dist < 1 {
return Err("invalid run length in stream".to_owned());
}
let buffer_size = self.buffer.capacity() as u16;
let len = if self.pos < dist {
// Handle copying from ahead, until we hit the end reading.
@@ -638,26 +623,13 @@
return Err("run length distance is bigger than the window size".to_owned());
}
let forward = buffer_size - dist;
// assert for unsafe code:
if pos_end + forward > self.buffer.len() as u16 {
return Err("invalid run length in stream".to_owned());
}
unsafe {
// HACK(eddyb) avoid bound checks, LLVM can't optimize these.
let buffer = self.buffer.as_mut_ptr();
let dst_end = buffer.offset(pos_end as isize);
let mut dst = buffer.offset(self.pos as isize);
let mut src = dst.offset(forward as isize);
while dst < dst_end {
*dst = *src;
dst = dst.offset(1);
src = src.offset(1);
}

for i in self.pos as usize..pos_end as usize {
self.buffer[i] = self.buffer[i + forward as usize];
}
// for i in self.pos as usize..pos_end as usize {
// self.buffer[i] = self.buffer[i + forward as usize]
// }
//
self.pos = pos_end;
left
} else {
@@ -671,32 +643,18 @@
(buffer_size, Some(pos_end - buffer_size))
};

if self.pos < dist && pos_end > self.pos {
return Err("invalid run length in stream".to_owned());
}

if self.buffer.len() < pos_end as usize {
unsafe {
self.buffer.set_len(pos_end as usize);
Collaborator

I wonder if push is efficient enough, compared to assigning.
The capacity should be fixed; maybe it just needs an assert!(pos_end as usize <= self.buffer.capacity()) to hint to LLVM.

Collaborator

That is, replacing if self.buffer.len() < pos_end as usize {...} (and the original loop) with:

        for i in self.pos as usize..self.buffer.len().min(pos_end as usize) {
            self.buffer[i] = self.buffer[i - dist as usize];
        }
        assert!(pos_end as usize <= self.buffer.capacity());
        while self.buffer.len() < pos_end as usize {
            let x = self.buffer[self.buffer.len() - dist as usize];
            self.buffer.push(x);
        }

One interesting question would be whether we are even validating that dist != 0.

Member Author

I am still iterating on this part, so I didn't include it in this PR. I will try this and report the performance impact. Thanks!

I'm also going to try extend_from_slice just to see how that works. Probably poorly, because the slice belongs to the same buffer, and since the vector might be reallocated the slice could be invalidated, so I'd have to clone it. Now if I had a fixed-size vector that was guaranteed not to reallocate, that might have been faster than push().

Collaborator

extend_from_slice is not really relevant, because this is RLE, not just "copy from the past", so if len > dist, you start repeating by reading bytes you've just written.
Also note that the length of the vector is only increased once; after that the sliding window stays constant-sized, so the push loop is less performance-sensitive.

You'd need a different loop structure with something like memcpy inside, and skipping dist-long runs, if you wanted to actually take advantage of distances larger than a few bytes, but I have my suspicions that it will be significantly faster.
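As a toy illustration of that overlap (a hypothetical standalone helper, not code from this PR): with dist = 1 the copy keeps re-reading the byte it has just written, so a single memcpy of the original one-byte source could not produce the whole run.

    // Toy back-reference copy: reads one byte at a time, so it naturally
    // observes bytes written earlier in the same run.
    fn run_len_dist_naive(buffer: &mut Vec<u8>, len: usize, dist: usize) {
        assert!(dist >= 1 && dist <= buffer.len());
        for _ in 0..len {
            let byte = buffer[buffer.len() - dist];
            buffer.push(byte);
        }
    }

    fn main() {
        let mut buf = b"abc".to_vec();
        run_len_dist_naive(&mut buf, 5, 1);
        assert_eq!(buf, b"abcccccc".to_vec()); // the last byte is repeated five times
    }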

Member Author

Sadly, the code you've suggested incurs a 9-11% performance penalty on decompressing entire files, depending on the file. I have tweaked it a bit and got it down to an 8-10% penalty; here's the code:

        let upper_bound = self.buffer.len().min(pos_end as usize);
        for i in self.pos as usize..upper_bound {
            self.buffer[i] = self.buffer[i - dist as usize];
        }
        assert!(pos_end as usize <= self.buffer.capacity());
        let initial_buffer_len = self.buffer.len();
        for i in initial_buffer_len..pos_end as usize {
            let x = self.buffer[i - dist as usize];
            self.buffer.push(x);
        }

The presence or absence of the assert() has no effect (in this code; I haven't tested the while variant without the assert).

I also got a 10% performance overhead simply by replacing unsafe {self.buffer.set_len(pos_end as usize);} with self.buffer.resize(pos_end as usize, 0u8); in the original code, which results in slightly more concise code than your solution.

Member Author

Thank you for investigating! This is probably the single best response to me filing a security issue I've ever seen, and I didn't even have a proof of concept this time.

As for the commit message: the function run_len_dist(), when taken in isolation, is vulnerable; the rest of the code just so happens to never call it with dist set to 0, so the crate as a whole is not vulnerable. I would prefer to note that in the commit history, and update only the "It is unclear..." part to reflect that the crate as a whole is not vulnerable. But I will drop the mention of the vulnerability if you insist.

Also, here's my memcpy-based prototype.

        if self.buffer.len() < pos_end as usize {
            self.buffer.resize(pos_end as usize, 0u8);
        }

        fill_slice_with_subslice(&mut self.buffer, (self.pos as usize - dist as usize, self.pos as usize), (self.pos as usize, pos_end as usize));
        fn fill_slice_with_subslice(slice: &mut[u8], (source_from, source_to): (usize, usize), (dest_from, dest_to): (usize, usize)) {
            let (source, destination) = if dest_from >= source_from {slice.split_at_mut(dest_from)} else {slice.split_at_mut(source_from)};
            let source = &mut source[source_from..source_to];
            let destination = &mut destination[..dest_to-dest_from];
            for i in (0..( (destination.len()) / source.len() )).map(|x| x * source.len()) {
                destination[i..source.len()+i].copy_from_slice(&source);
            }
        }

It fails some tests and I have been trying to understand why to no avail, so I'm afraid I won't be able to complete it.

I'm afraid I'm not familiar with assembler or LLVM IR, so I will not be able to inspect it in any meaningful way. Sorry. I will benchmark your suggested changes with panic=abort on nightly and report the results.

Collaborator

Note that I don't mean panic=abort; there's a "nightly" feature which changes what abort() does. Although panic=abort might itself cause further improvements, so there are probably a few combinations to test.

I wouldn't have done the copy_from_slice with a for loop, or at least not with division/multiplication - there's a way to step ranges, although I'd also try a while loop that updates state instead.

But anyway, len is not necessarily a multiple of dist, so you're missing something to handle len % dist (again, it should ideally be done without using %, unless you can find a way to use division that is a performance boost, but I doubt that's possible here).

Member Author

I've conjured up a function based on copy_from_slice() that is more efficient than a per-element loop. Using while made it a lot more readable, so thanks for the tip!

        if self.buffer.len() < pos_end as usize {
            self.buffer.resize(pos_end as usize, 0u8);
        }
        fill_slice_with_subslice(&mut self.buffer, (self.pos as usize - dist as usize, self.pos as usize), (self.pos as usize, pos_end as usize));
        fn fill_slice_with_subslice(slice: &mut[u8], (source_from, source_to): (usize, usize), (dest_from, dest_to): (usize, usize)) {
            let (source, destination) = slice.split_at_mut(dest_from); //TODO: allow destination to be lower than source
            let source = &source[source_from..source_to];
            let destination = &mut destination[..(dest_to - dest_from)];
            let mut offset = 0;
            while offset + source.len() < destination.len() {
                destination[offset..source.len()+offset].copy_from_slice(&source);
                offset += source.len();
            }
            let remaining_chunk = destination.len()-offset;
            &mut destination[offset..].copy_from_slice(&source[..remaining_chunk]);
        }

It offsets some of the costs of safe memory initialization so that switching to safe initialization would only create 5% overhead instead of 10%. If we switch the loop above to something like this too, then we'd have an entirely safe crate with the same performance as before.

I have a branch with all changes from this PR plus the optimized loop: https://github.com/Shnatsel/inflate/tree/safe-with-optimized-loop

Sadly, this function still fails one test: the line with its invocation causes an integer overflow in the test "issue_30_realworld", i.e. self.pos is actually less than dist. How it worked in a simple loop that did self.buffer[i - dist as usize] with i initialized to self.pos, but doesn't work here, is beyond me.

Collaborator

I still think this code could be much simpler if it was duplicated between forward and backward, just like the old code - in fact, it would be very similar to the old code, just doing more than one element at a time.

Also, is there a benefit to always using the same source subslice, or is it just as efficient / more efficient to always copy the last dist bytes? There may be some cache effects here.

Member Author

Always copying the last dist bytes seems to be exactly as efficient as always copying the same slice.

The code I ended up with looks like this:

        let (source, destination) = (&mut self.buffer).split_at_mut(self.pos as usize);
        let source = &source[source.len() - dist as usize..];
        let mut offset = 0;
        while offset + source.len() < destination.len() {
            destination[offset..source.len()+offset].copy_from_slice(&source);
            offset += source.len();
        }
        let remaining_chunk = destination.len()-offset;
        &mut destination[offset..].copy_from_slice(&source[..remaining_chunk]);

This is a bit more readable, and it nets a 3% to 7% performance improvement.

Surprisingly, putting the same code in the other copying loop in this function actually hurts performance by 1% on my samples.

I've also tried an iterator-based version, which is concise but as slow as copying byte-by-byte:

        let (source, destination) = (&mut self.buffer).split_at_mut(self.pos as usize);
        let source = &source[source.len() - dist as usize..];
        for (d,s) in destination.chunks_mut(dist as usize).zip(source.chunks(dist as usize).cycle()) {
            let d_len = d.len(); // last chunk has a size lower than we've specified
            d.copy_from_slice(&s[..d_len]);
        }

However, I've realized that this function can return pretty much any garbage and tests won't fail. Since this creates regression potential, optimizing copying falls out of scope of this PR.
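For what it's worth, a hypothetical regression test along these lines could pin down the fill pattern before any further optimization; the function names are made up, and chunked_copy is just a standalone restatement of the loop above, assuming dist >= 1 and a buffer that already holds at least dist bytes:

    // Sketch of a regression test (not part of this PR): compare the chunked
    // copy_from_slice loop against a naive byte-at-a-time reference over many
    // (dist, len) pairs, so a wrong fill pattern would actually fail a test.
    fn naive_copy(buffer: &mut Vec<u8>, len: usize, dist: usize) {
        for _ in 0..len {
            let byte = buffer[buffer.len() - dist];
            buffer.push(byte);
        }
    }

    // Standalone restatement of the chunked loop: grow the buffer, then fill
    // it in dist-sized chunks copied from the last dist bytes before the run.
    fn chunked_copy(buffer: &mut Vec<u8>, len: usize, dist: usize) {
        let pos = buffer.len();
        buffer.resize(pos + len, 0u8);
        let (source, destination) = buffer.split_at_mut(pos);
        let source = &source[pos - dist..];
        let mut offset = 0;
        while offset + source.len() < destination.len() {
            destination[offset..offset + source.len()].copy_from_slice(source);
            offset += source.len();
        }
        let remaining = destination.len() - offset;
        destination[offset..].copy_from_slice(&source[..remaining]);
    }

    #[test]
    fn chunked_copy_matches_naive_copy() {
        for dist in 1..=16usize {
            for len in 1..=64usize {
                let initial: Vec<u8> = (0u8..16).collect();
                let mut expected = initial.clone();
                let mut actual = initial.clone();
                naive_copy(&mut expected, len, dist);
                chunked_copy(&mut actual, len, dist);
                assert_eq!(expected, actual, "dist={} len={}", dist, len);
            }
        }
    }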

}
}

// assert for unsafe code:
if self.pos < dist && pos_end > self.pos {
return Err("invalid run length in stream".to_owned());
}
unsafe {
// HACK(eddyb) avoid bound checks, LLVM can't optimize these.
let buffer = self.buffer.as_mut_ptr();
let dst_end = buffer.offset(pos_end as isize);
let mut dst = buffer.offset(self.pos as isize);
let mut src = dst.offset(-(dist as isize));
while dst < dst_end {
*dst = *src;
dst = dst.offset(1);
src = src.offset(1);
}
for i in self.pos as usize..pos_end as usize {
self.buffer[i] = self.buffer[i - dist as usize];
}
// for i in self.pos as usize..pos_end as usize {
// self.buffer[i] = self.buffer[i - dist as usize]
// }
//
self.pos = pos_end;
Ok(left)
}
@@ -713,9 +671,7 @@ impl InflateStream {
if (self.pos as usize) < self.buffer.len() {
self.buffer[self.pos as usize] = b;
} else {
if (self.pos as usize) != self.buffer.len() {
abort();
}
assert_eq!(self.pos as usize, self.buffer.len());
self.buffer.push(b);
}
self.pos += 1;