diff --git a/benches/native.rs b/benches/native.rs index 36db8f1..dcd3f5d 100644 --- a/benches/native.rs +++ b/benches/native.rs @@ -8,7 +8,7 @@ extern crate im; extern crate rand; extern crate test; -use rand::{rngs::SmallRng, SeedableRng, Rng}; +use rand::{rngs::SmallRng, Rng, SeedableRng}; use std::collections::{BTreeMap, HashMap, VecDeque}; use std::iter::FromIterator; use test::Bencher; diff --git a/src/hash/set.rs b/src/hash/set.rs index 4f1dfee..34ea4b4 100644 --- a/src/hash/set.rs +++ b/src/hash/set.rs @@ -31,10 +31,13 @@ use std::iter::FusedIterator; use std::iter::{FromIterator, IntoIterator, Sum}; use std::ops::{Add, Deref, Mul}; -use crate::nodes::hamt::{hash_key, Drain as NodeDrain, HashValue, Iter as NodeIter, Node}; +use crate::nodes::hamt::{ + hash_key, Drain as NodeDrain, HashValue, Iter as NodeIter, Node, + OrderedDrain as NodeOrderedDrain, OrderedIter as NodeOrderedIter, +}; use crate::ordset::OrdSet; -use crate::Vector; use crate::util::{Pool, PoolRef, Ref}; +use crate::Vector; /// Construct a set from a sequence of values. /// @@ -313,15 +316,44 @@ impl HashSet { /// Get an iterator over the values in a hash set. /// + /// No ordering guarantees are provided. + #[must_use] + pub fn iter(&self) -> Iter<'_, A> { + Iter { + it: NodeIter::new(&self.root, self.size), + } + } + + /// Get an ordered iterator over the values in a hash set. + /// /// Please note that the order is consistent between sets using /// the same hasher, but no other ordering guarantee is offered. /// Items will not come out in insertion order or sort order. /// They will, however, come out in the same order every time for /// the same set. #[must_use] - pub fn iter(&self) -> Iter<'_, A> { - Iter { - it: NodeIter::new(&self.root, self.size), + pub fn ordered_iter(&self) -> OrderedIter<'_, A> { + OrderedIter { + it: NodeOrderedIter::new(&self.root, self.size), + } + } +} + +impl HashSet +where +A: Hash + Clone + Ord +{ + /// Get an ordered iterator over the values in a hash set. + /// + /// Please note that the order is consistent between sets using + /// the same hasher, but no other ordering guarantee is offered. + /// Items will not come out in insertion order or sort order. + /// They will, however, come out in the same order every time for + /// the same set. + #[must_use] + pub fn into_ordered_iter(self) -> OrderedConsumingIter { + OrderedConsumingIter { + it: NodeOrderedDrain::new(&self.pool.0, self.root, self.size), } } } @@ -667,14 +699,14 @@ where impl PartialOrd for HashSet where - A: Hash + Eq + Clone + PartialOrd, + A: Hash + Eq + Clone + PartialOrd + Ord, S: BuildHasher + Default, { fn partial_cmp(&self, other: &Self) -> Option { if Ref::ptr_eq(&self.hasher, &other.hasher) { - return self.iter().partial_cmp(other.iter()); + return self.ordered_iter().partial_cmp(other.ordered_iter()); } - self.iter().partial_cmp(other.iter()) + self.ordered_iter().partial_cmp(other.ordered_iter()) } } @@ -685,22 +717,22 @@ where { fn cmp(&self, other: &Self) -> Ordering { if Ref::ptr_eq(&self.hasher, &other.hasher) { - return self.iter().cmp(other.iter()); + return self.ordered_iter().cmp(other.ordered_iter()); } - self.iter().cmp(other.iter()) + self.ordered_iter().cmp(other.ordered_iter()) } } impl Hash for HashSet where - A: Hash + Eq, + A: Hash + Eq + Ord, S: BuildHasher + Default, { fn hash(&self, state: &mut H) where H: Hasher, { - for i in self.iter() { + for i in self.ordered_iter() { i.hash(state); } } @@ -857,6 +889,35 @@ impl<'a, A> ExactSizeIterator for Iter<'a, A> {} impl<'a, A> FusedIterator for Iter<'a, A> {} +/// An ordered iterator over the elements of a set. +/// Given a deterministic hasher, this iterator yields values in a deterministic +/// order: two sets with the same elements are enumerated in the same order +/// regardless of the order in which elements were inserted in the sets. Items +/// are returned in the order of their hash values. Items with identical hash +/// values are sorted based on the `Ord` trait. +pub struct OrderedIter<'a, A> { + it: NodeOrderedIter<'a, Value>, +} + +impl<'a, A> Iterator for OrderedIter<'a, A> +where + A: 'a + Ord, +{ + type Item = &'a A; + + fn next(&mut self) -> Option { + self.it.next().map(|(v, _)| &v.0) + } + + fn size_hint(&self) -> (usize, Option) { + self.it.size_hint() + } +} + +impl<'a, A: Ord> ExactSizeIterator for OrderedIter<'a, A> {} + +impl<'a, A: Ord> FusedIterator for OrderedIter<'a, A> {} + /// A consuming iterator over the elements of a set. pub struct ConsumingIter where @@ -884,6 +945,32 @@ impl ExactSizeIterator for ConsumingIter where A: Hash + Eq + Clone {} impl FusedIterator for ConsumingIter where A: Hash + Eq + Clone {} +/// An ordered consuming iterator over the elements of a set. +pub struct OrderedConsumingIter + where A: Hash + Eq + Clone + Ord, +{ + it: NodeOrderedDrain>, +} + +impl Iterator for OrderedConsumingIter +where + A: Hash + Eq + Clone + Ord, +{ + type Item = A; + + fn next(&mut self) -> Option { + self.it.next().map(|(v, _)| v.0) + } + + fn size_hint(&self) -> (usize, Option) { + self.it.size_hint() + } +} + +impl ExactSizeIterator for OrderedConsumingIter where A: Hash + Eq + Clone + Ord {} + +impl FusedIterator for OrderedConsumingIter where A: Hash + Eq + Clone + Ord {} + // Iterator conversions impl FromIterator for HashSet @@ -1063,6 +1150,7 @@ mod test { use crate::test::LolHasher; use ::proptest::num::i16; use ::proptest::proptest; + use std::collections::hash_map::DefaultHasher; use std::hash::BuildHasherDefault; #[test] @@ -1119,6 +1207,39 @@ mod test { } } + #[test] + fn consistent_ord() { + let mut set1 = HashSet::with_hasher(>::default()); + let mut set2 = HashSet::with_hasher(>::default()); + + // Create two sets with identical elements but different insertion order. + // The sets are big enough to trigger hash collisions. + for i in 0..50_001 { + set1.insert(i); + set2.insert(50_000 - i); + } + + // The sets are the same according to Eq, Ord, and Hash. + assert_eq!(set1, set2); + assert_eq!(set1.cmp(&set2), Ordering::Equal); + + let mut s = DefaultHasher::new(); + set1.hash(&mut s); + let hash1 = s.finish(); + + let mut s = DefaultHasher::new(); + set2.hash(&mut s); + let hash2 = s.finish(); + assert_eq!(hash1, hash2); + + // Ordered iterators must yield elements in the same order. + let v1: Vec<_> = set1.into_ordered_iter().collect(); + let v2: Vec<_> = set2.into_ordered_iter().collect(); + // Don't use `assert_eq!`, as on error it allocates a vector + // of size `v1.len() * v2.len()` and runs out of memory. + assert!(v1 == v2); + } + proptest! { #[test] fn proptest_a_set(ref s in hash_set(".*", 10..100)) { diff --git a/src/nodes/hamt.rs b/src/nodes/hamt.rs index 8ee6157..0194739 100644 --- a/src/nodes/hamt.rs +++ b/src/nodes/hamt.rs @@ -549,6 +549,92 @@ impl<'a, A> ExactSizeIterator for Iter<'a, A> where A: 'a {} impl<'a, A> FusedIterator for Iter<'a, A> where A: 'a {} +// Ordered Ref iterator. +// Given a deterministic hasher, this iterator yields values in a deterministic +// order: two sets with the same elements are enumerated in the same order +// regardless of the order in which elements were inserted in the sets. Items +// are returned in the order of their hash values. Items with identical hash +// values are sorted based on the `Ord` trait. + +pub(crate) struct OrderedIter<'a, A> { + count: usize, + stack: Vec, HashWidth>>, + current: ChunkIter<'a, Entry, HashWidth>, + collision: Option<(HashBits, std::vec::IntoIter<&'a A>)>, +} + +impl<'a, A> OrderedIter<'a, A> +where + A: 'a, +{ + pub(crate) fn new(root: &'a Node, size: usize) -> Self { + OrderedIter { + count: size, + stack: Vec::with_capacity((HASH_WIDTH / HASH_SHIFT) + 1), + current: root.data.iter(), + collision: None, + } + } +} + +impl<'a, A> Iterator for OrderedIter<'a, A> +where + A: 'a + Ord, +{ + type Item = (&'a A, HashBits); + + fn next(&mut self) -> Option { + if self.count == 0 { + return None; + } + if self.collision.is_some() { + if let Some((hash, ref mut coll)) = self.collision { + match coll.next() { + None => {} + Some(value) => { + self.count -= 1; + return Some((value, hash)); + } + } + } + self.collision = None; + return self.next(); + } + match self.current.next() { + Some(Entry::Value(value, hash)) => { + self.count -= 1; + Some((value, *hash)) + } + Some(Entry::Node(child)) => { + let current = mem::replace(&mut self.current, child.data.iter()); + self.stack.push(current); + self.next() + } + Some(Entry::Collision(coll)) => { + let mut refs: Vec<&'a A> = coll.data.iter().collect(); + refs.sort(); + self.collision = Some((coll.hash, refs.into_iter())); + self.next() + } + None => match self.stack.pop() { + None => None, + Some(iter) => { + self.current = iter; + self.next() + } + }, + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.count, Some(self.count)) + } +} + +impl<'a, A> ExactSizeIterator for OrderedIter<'a, A> where A: 'a + Ord {} + +impl<'a, A> FusedIterator for OrderedIter<'a, A> where A: 'a + Ord {} + // Mut ref iterator pub(crate) struct IterMut<'a, A> { @@ -713,6 +799,91 @@ impl ExactSizeIterator for Drain where A: Clone {} impl FusedIterator for Drain where A: Clone {} +// Ordered consuming iterator + +pub(crate) struct OrderedDrain +where + A: HashValue, +{ + count: usize, + pool: Pool>, + stack: Vec>>, + current: PoolRef>, + collision: Option>, +} + +impl OrderedDrain +where + A: HashValue, +{ + pub(crate) fn new(pool: &Pool>, root: PoolRef>, size: usize) -> Self { + OrderedDrain { + count: size, + pool: pool.clone(), + stack: vec![], + current: root, + collision: None, + } + } +} + +impl Iterator for OrderedDrain +where + A: HashValue + Clone + Ord, +{ + type Item = (A, HashBits); + + fn next(&mut self) -> Option { + if self.count == 0 { + return None; + } + if self.collision.is_some() { + if let Some(ref mut coll) = self.collision { + if let Some(value) = coll.data.pop() { + self.count -= 1; + return Some((value, coll.hash)); + } + } + self.collision = None; + return self.next(); + } + match PoolRef::make_mut(&self.pool, &mut self.current).data.pop() { + Some(Entry::Value(value, hash)) => { + self.count -= 1; + Some((value, hash)) + } + Some(Entry::Collision(coll_ref)) => { + let mut coll = clone_ref(coll_ref); + // Sort the vector to ensure that elements with the same hash value + // come out in the same order regardless of the insertion order. + coll.data.sort(); + self.collision = Some(coll); + self.next() + } + Some(Entry::Node(child)) => { + let parent = mem::replace(&mut self.current, child); + self.stack.push(parent); + self.next() + } + None => match self.stack.pop() { + None => None, + Some(parent) => { + self.current = parent; + self.next() + } + }, + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.count, Some(self.count)) + } +} + +impl ExactSizeIterator for OrderedDrain where A: Clone + Ord {} + +impl FusedIterator for OrderedDrain where A: Clone + Ord {} + impl fmt::Debug for Node { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { write!(f, "Node[ ")?;