From 46f13d895e55cb01b7888584e0f220d5e302b00e Mon Sep 17 00:00:00 2001 From: Leonid Ryzhyk Date: Sat, 20 Feb 2021 00:38:09 -0800 Subject: [PATCH] Ordered iterators. Add ordered iterators over HAMT and `HashSet`. Given two sets with the same elements and the same hashers, the new iterators enumerate these sets in the same order. This is in contrast to existing iterators that do not guarantee consistent ordering for elements with the same hash values. Consistent ordering is achieved by sorting collision nodes on the fly during iteration. We use the new iterators to fix the implementation of `Ord`, `PartialOrd`, and `Hash` for `HashSet` (https://github.com/bodil/im-rs/issues/175). --- benches/native.rs | 2 +- src/hash/set.rs | 145 +++++++++++++++++++++++++++++++++++---- src/nodes/hamt.rs | 171 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 305 insertions(+), 13 deletions(-) diff --git a/benches/native.rs b/benches/native.rs index 36db8f1..dcd3f5d 100644 --- a/benches/native.rs +++ b/benches/native.rs @@ -8,7 +8,7 @@ extern crate im; extern crate rand; extern crate test; -use rand::{rngs::SmallRng, SeedableRng, Rng}; +use rand::{rngs::SmallRng, Rng, SeedableRng}; use std::collections::{BTreeMap, HashMap, VecDeque}; use std::iter::FromIterator; use test::Bencher; diff --git a/src/hash/set.rs b/src/hash/set.rs index 4f1dfee..34ea4b4 100644 --- a/src/hash/set.rs +++ b/src/hash/set.rs @@ -31,10 +31,13 @@ use std::iter::FusedIterator; use std::iter::{FromIterator, IntoIterator, Sum}; use std::ops::{Add, Deref, Mul}; -use crate::nodes::hamt::{hash_key, Drain as NodeDrain, HashValue, Iter as NodeIter, Node}; +use crate::nodes::hamt::{ + hash_key, Drain as NodeDrain, HashValue, Iter as NodeIter, Node, + OrderedDrain as NodeOrderedDrain, OrderedIter as NodeOrderedIter, +}; use crate::ordset::OrdSet; -use crate::Vector; use crate::util::{Pool, PoolRef, Ref}; +use crate::Vector; /// Construct a set from a sequence of values. /// @@ -313,15 +316,44 @@ impl HashSet { /// Get an iterator over the values in a hash set. /// + /// No ordering guarantees are provided. + #[must_use] + pub fn iter(&self) -> Iter<'_, A> { + Iter { + it: NodeIter::new(&self.root, self.size), + } + } + + /// Get an ordered iterator over the values in a hash set. + /// /// Please note that the order is consistent between sets using /// the same hasher, but no other ordering guarantee is offered. /// Items will not come out in insertion order or sort order. /// They will, however, come out in the same order every time for /// the same set. #[must_use] - pub fn iter(&self) -> Iter<'_, A> { - Iter { - it: NodeIter::new(&self.root, self.size), + pub fn ordered_iter(&self) -> OrderedIter<'_, A> { + OrderedIter { + it: NodeOrderedIter::new(&self.root, self.size), + } + } +} + +impl HashSet +where +A: Hash + Clone + Ord +{ + /// Get an ordered iterator over the values in a hash set. + /// + /// Please note that the order is consistent between sets using + /// the same hasher, but no other ordering guarantee is offered. + /// Items will not come out in insertion order or sort order. + /// They will, however, come out in the same order every time for + /// the same set. + #[must_use] + pub fn into_ordered_iter(self) -> OrderedConsumingIter { + OrderedConsumingIter { + it: NodeOrderedDrain::new(&self.pool.0, self.root, self.size), } } } @@ -667,14 +699,14 @@ where impl PartialOrd for HashSet where - A: Hash + Eq + Clone + PartialOrd, + A: Hash + Eq + Clone + PartialOrd + Ord, S: BuildHasher + Default, { fn partial_cmp(&self, other: &Self) -> Option { if Ref::ptr_eq(&self.hasher, &other.hasher) { - return self.iter().partial_cmp(other.iter()); + return self.ordered_iter().partial_cmp(other.ordered_iter()); } - self.iter().partial_cmp(other.iter()) + self.ordered_iter().partial_cmp(other.ordered_iter()) } } @@ -685,22 +717,22 @@ where { fn cmp(&self, other: &Self) -> Ordering { if Ref::ptr_eq(&self.hasher, &other.hasher) { - return self.iter().cmp(other.iter()); + return self.ordered_iter().cmp(other.ordered_iter()); } - self.iter().cmp(other.iter()) + self.ordered_iter().cmp(other.ordered_iter()) } } impl Hash for HashSet where - A: Hash + Eq, + A: Hash + Eq + Ord, S: BuildHasher + Default, { fn hash(&self, state: &mut H) where H: Hasher, { - for i in self.iter() { + for i in self.ordered_iter() { i.hash(state); } } @@ -857,6 +889,35 @@ impl<'a, A> ExactSizeIterator for Iter<'a, A> {} impl<'a, A> FusedIterator for Iter<'a, A> {} +/// An ordered iterator over the elements of a set. +/// Given a deterministic hasher, this iterator yields values in a deterministic +/// order: two sets with the same elements are enumerated in the same order +/// regardless of the order in which elements were inserted in the sets. Items +/// are returned in the order of their hash values. Items with identical hash +/// values are sorted based on the `Ord` trait. +pub struct OrderedIter<'a, A> { + it: NodeOrderedIter<'a, Value>, +} + +impl<'a, A> Iterator for OrderedIter<'a, A> +where + A: 'a + Ord, +{ + type Item = &'a A; + + fn next(&mut self) -> Option { + self.it.next().map(|(v, _)| &v.0) + } + + fn size_hint(&self) -> (usize, Option) { + self.it.size_hint() + } +} + +impl<'a, A: Ord> ExactSizeIterator for OrderedIter<'a, A> {} + +impl<'a, A: Ord> FusedIterator for OrderedIter<'a, A> {} + /// A consuming iterator over the elements of a set. pub struct ConsumingIter where @@ -884,6 +945,32 @@ impl ExactSizeIterator for ConsumingIter where A: Hash + Eq + Clone {} impl FusedIterator for ConsumingIter where A: Hash + Eq + Clone {} +/// An ordered consuming iterator over the elements of a set. +pub struct OrderedConsumingIter + where A: Hash + Eq + Clone + Ord, +{ + it: NodeOrderedDrain>, +} + +impl Iterator for OrderedConsumingIter +where + A: Hash + Eq + Clone + Ord, +{ + type Item = A; + + fn next(&mut self) -> Option { + self.it.next().map(|(v, _)| v.0) + } + + fn size_hint(&self) -> (usize, Option) { + self.it.size_hint() + } +} + +impl ExactSizeIterator for OrderedConsumingIter where A: Hash + Eq + Clone + Ord {} + +impl FusedIterator for OrderedConsumingIter where A: Hash + Eq + Clone + Ord {} + // Iterator conversions impl FromIterator for HashSet @@ -1063,6 +1150,7 @@ mod test { use crate::test::LolHasher; use ::proptest::num::i16; use ::proptest::proptest; + use std::collections::hash_map::DefaultHasher; use std::hash::BuildHasherDefault; #[test] @@ -1119,6 +1207,39 @@ mod test { } } + #[test] + fn consistent_ord() { + let mut set1 = HashSet::with_hasher(>::default()); + let mut set2 = HashSet::with_hasher(>::default()); + + // Create two sets with identical elements but different insertion order. + // The sets are big enough to trigger hash collisions. + for i in 0..50_001 { + set1.insert(i); + set2.insert(50_000 - i); + } + + // The sets are the same according to Eq, Ord, and Hash. + assert_eq!(set1, set2); + assert_eq!(set1.cmp(&set2), Ordering::Equal); + + let mut s = DefaultHasher::new(); + set1.hash(&mut s); + let hash1 = s.finish(); + + let mut s = DefaultHasher::new(); + set2.hash(&mut s); + let hash2 = s.finish(); + assert_eq!(hash1, hash2); + + // Ordered iterators must yield elements in the same order. + let v1: Vec<_> = set1.into_ordered_iter().collect(); + let v2: Vec<_> = set2.into_ordered_iter().collect(); + // Don't use `assert_eq!`, as on error it allocates a vector + // of size `v1.len() * v2.len()` and runs out of memory. + assert!(v1 == v2); + } + proptest! { #[test] fn proptest_a_set(ref s in hash_set(".*", 10..100)) { diff --git a/src/nodes/hamt.rs b/src/nodes/hamt.rs index 8ee6157..0194739 100644 --- a/src/nodes/hamt.rs +++ b/src/nodes/hamt.rs @@ -549,6 +549,92 @@ impl<'a, A> ExactSizeIterator for Iter<'a, A> where A: 'a {} impl<'a, A> FusedIterator for Iter<'a, A> where A: 'a {} +// Ordered Ref iterator. +// Given a deterministic hasher, this iterator yields values in a deterministic +// order: two sets with the same elements are enumerated in the same order +// regardless of the order in which elements were inserted in the sets. Items +// are returned in the order of their hash values. Items with identical hash +// values are sorted based on the `Ord` trait. + +pub(crate) struct OrderedIter<'a, A> { + count: usize, + stack: Vec, HashWidth>>, + current: ChunkIter<'a, Entry, HashWidth>, + collision: Option<(HashBits, std::vec::IntoIter<&'a A>)>, +} + +impl<'a, A> OrderedIter<'a, A> +where + A: 'a, +{ + pub(crate) fn new(root: &'a Node, size: usize) -> Self { + OrderedIter { + count: size, + stack: Vec::with_capacity((HASH_WIDTH / HASH_SHIFT) + 1), + current: root.data.iter(), + collision: None, + } + } +} + +impl<'a, A> Iterator for OrderedIter<'a, A> +where + A: 'a + Ord, +{ + type Item = (&'a A, HashBits); + + fn next(&mut self) -> Option { + if self.count == 0 { + return None; + } + if self.collision.is_some() { + if let Some((hash, ref mut coll)) = self.collision { + match coll.next() { + None => {} + Some(value) => { + self.count -= 1; + return Some((value, hash)); + } + } + } + self.collision = None; + return self.next(); + } + match self.current.next() { + Some(Entry::Value(value, hash)) => { + self.count -= 1; + Some((value, *hash)) + } + Some(Entry::Node(child)) => { + let current = mem::replace(&mut self.current, child.data.iter()); + self.stack.push(current); + self.next() + } + Some(Entry::Collision(coll)) => { + let mut refs: Vec<&'a A> = coll.data.iter().collect(); + refs.sort(); + self.collision = Some((coll.hash, refs.into_iter())); + self.next() + } + None => match self.stack.pop() { + None => None, + Some(iter) => { + self.current = iter; + self.next() + } + }, + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.count, Some(self.count)) + } +} + +impl<'a, A> ExactSizeIterator for OrderedIter<'a, A> where A: 'a + Ord {} + +impl<'a, A> FusedIterator for OrderedIter<'a, A> where A: 'a + Ord {} + // Mut ref iterator pub(crate) struct IterMut<'a, A> { @@ -713,6 +799,91 @@ impl ExactSizeIterator for Drain where A: Clone {} impl FusedIterator for Drain where A: Clone {} +// Ordered consuming iterator + +pub(crate) struct OrderedDrain +where + A: HashValue, +{ + count: usize, + pool: Pool>, + stack: Vec>>, + current: PoolRef>, + collision: Option>, +} + +impl OrderedDrain +where + A: HashValue, +{ + pub(crate) fn new(pool: &Pool>, root: PoolRef>, size: usize) -> Self { + OrderedDrain { + count: size, + pool: pool.clone(), + stack: vec![], + current: root, + collision: None, + } + } +} + +impl Iterator for OrderedDrain +where + A: HashValue + Clone + Ord, +{ + type Item = (A, HashBits); + + fn next(&mut self) -> Option { + if self.count == 0 { + return None; + } + if self.collision.is_some() { + if let Some(ref mut coll) = self.collision { + if let Some(value) = coll.data.pop() { + self.count -= 1; + return Some((value, coll.hash)); + } + } + self.collision = None; + return self.next(); + } + match PoolRef::make_mut(&self.pool, &mut self.current).data.pop() { + Some(Entry::Value(value, hash)) => { + self.count -= 1; + Some((value, hash)) + } + Some(Entry::Collision(coll_ref)) => { + let mut coll = clone_ref(coll_ref); + // Sort the vector to ensure that elements with the same hash value + // come out in the same order regardless of the insertion order. + coll.data.sort(); + self.collision = Some(coll); + self.next() + } + Some(Entry::Node(child)) => { + let parent = mem::replace(&mut self.current, child); + self.stack.push(parent); + self.next() + } + None => match self.stack.pop() { + None => None, + Some(parent) => { + self.current = parent; + self.next() + } + }, + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.count, Some(self.count)) + } +} + +impl ExactSizeIterator for OrderedDrain where A: Clone + Ord {} + +impl FusedIterator for OrderedDrain where A: Clone + Ord {} + impl fmt::Debug for Node { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { write!(f, "Node[ ")?;