Skip to content

Commit ecb3fae

Browse files
committed
WIP: docs
1 parent b57fac0 commit ecb3fae

File tree

10 files changed

+241
-146
lines changed

10 files changed

+241
-146
lines changed

bloomfilter-blocked/src/Data/BloomFilter.hs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ module Data.BloomFilter (
1212

1313
import Data.BloomFilter.Classic
1414

15-
-- $example
15+
-- $example
1616
--
1717
-- This example reads a dictionary file containing one word per line,
1818
-- constructs a Bloom filter with a 1% false positive rate, and

bloomfilter-blocked/src/Data/BloomFilter/Blocked.hs

Lines changed: 67 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,17 @@
1-
-- |
1+
-- | A fast, space efficient Bloom filter implementation. A Bloom filter is a
2+
-- set-like data structure that provides a probabilistic membership test.
23
--
3-
-- A fast, space efficient Bloom filter implementation. A Bloom
4-
-- filter is a set-like data structure that provides a probabilistic
5-
-- membership test.
4+
-- * Queries do not give false negatives. When an element is added to a filter,
5+
-- a subsequent membership test will definitely return 'True'.
66
--
7-
-- * Queries do not give false negatives. When an element is added to
8-
-- a filter, a subsequent membership test will definitely return
9-
-- 'True'.
7+
-- * False positives /are/ possible. If an element has not been added to a
8+
-- filter, a membership test /may/ nevertheless indicate that the element is
9+
-- present.
1010
--
11-
-- * False positives /are/ possible. If an element has not been added
12-
-- to a filter, a membership test /may/ nevertheless indicate that
13-
-- the element is present.
14-
--
15-
1611
module Data.BloomFilter.Blocked (
12+
-- * Overview
13+
-- $overview
14+
1715
-- * Types
1816
Hash,
1917
Salt,
@@ -57,6 +55,7 @@ module Data.BloomFilter.Blocked (
5755
maxSizeBits,
5856
insert,
5957
insertMany,
58+
read,
6059

6160
-- ** Conversion
6261
freeze,
@@ -68,6 +67,7 @@ module Data.BloomFilter.Blocked (
6867
hashesWithSalt,
6968
insertHashes,
7069
elemHashes,
70+
readHashes,
7171
-- ** Prefetching
7272
prefetchInsert,
7373
prefetchElem,
@@ -80,23 +80,60 @@ import Data.Bits ((.&.))
8080
import Data.Primitive.ByteArray (MutableByteArray)
8181
import qualified Data.Primitive.PrimArray as P
8282

83-
import Data.BloomFilter.Blocked.Calc
83+
import Data.BloomFilter.Blocked.Calc (BitsPerEntry, BloomPolicy (..),
84+
BloomSize (..), FPR, NumEntries, policyFPR, policyForBits,
85+
policyForFPR, sizeForBits, sizeForFPR, sizeForPolicy)
8486
import Data.BloomFilter.Blocked.Internal hiding (deserialise)
8587
import qualified Data.BloomFilter.Blocked.Internal as Internal
8688
import Data.BloomFilter.Hash
8789

88-
import Prelude hiding (elem, notElem)
90+
import Prelude hiding (elem, notElem, read)
91+
92+
-- $setup
93+
--
94+
-- >>> import Text.Printf
95+
96+
-- $overview
97+
--
98+
-- Each of the functions for creating Bloom filters accepts a 'BloomSize'. The
99+
-- size determines the number of bits that should be used for the filter. Note
100+
-- that a filter is fixed in size; it cannot be resized after creation.
101+
--
102+
-- The size can be specified by asking for a target false positive rate (FPR)
103+
-- or a number of bits per element, and the number of elements in the filter.
104+
-- For example:
105+
--
106+
-- * @'sizeForFPR' 1e-3 10_000@ for a Bloom filter sized for 10,000 elements
107+
-- with a false positive rate of 1 in 1000
108+
--
109+
-- * @'sizeForBits' 10 10_000@ for a Bloom filter sized for 10,000 elements
110+
-- with 10 bits per element
111+
--
112+
-- Depending on the application it may be more important to target a fixed
113+
-- amount of memory to use, or target a specific FPR.
114+
--
115+
-- As a very rough guide for filter sizes, here are a range of FPRs and bits
116+
-- per element:
117+
--
118+
-- * FPR of 1e-1 requires approximately 4.8 bits per element
119+
-- * FPR of 1e-2 requires approximately 9.8 bits per element
120+
-- * FPR of 1e-3 requires approximately 15.8 bits per element
121+
-- * FPR of 1e-4 requires approximately 22.6 bits per element
122+
-- * FPR of 1e-5 requires approximately 30.2 bits per element
123+
--
124+
-- >>> fmap (printf "%0.1f" . policyBits . policyForFPR) [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
125+
-- ["4.8","9.8","15.8","22.6","30.2"]
89126

90127
-- | Create an immutable Bloom filter, using the given setup function
91128
-- which executes in the 'ST' monad.
92129
--
93130
-- Example:
94131
--
95-
-- @
132+
-- >>> :{
96133
-- filter = create (sizeForBits 16 2) 4 $ \mf -> do
97-
-- insert mf \"foo\"
98-
-- insert mf \"bar\"
99-
-- @
134+
-- insert mf "foo"
135+
-- insert mf "bar"
136+
-- :}
100137
--
101138
-- Note that the result of the setup function is not used.
102139
create :: BloomSize
@@ -141,6 +178,12 @@ elem = \ !x !b -> elemHashes b (hashesWithSalt (hashSalt b) x)
141178
notElem :: Hashable a => a -> Bloom a -> Bool
142179
notElem = \x b -> not (x `elem` b)
143180

181+
-- | Query a mutable Bloom filter for membership. If the value is
182+
-- present, return @True@. If the value is not present, there is
183+
-- /still/ some possibility that @True@ will be returned.
184+
read :: Hashable a => MBloom s a -> a -> ST s Bool
185+
read !mb !x = readHashes mb (hashesWithSalt (mbHashSalt mb) x)
186+
144187
-- | Build an immutable Bloom filter from a seed value. The seeding
145188
-- function populates the filter as follows.
146189
--
@@ -168,6 +211,7 @@ unfold bloomsize bloomsalt f k =
168211
Nothing -> pure ()
169212
Just (a, j') -> insert mb a >> loop j'
170213

214+
{-# INLINEABLE fromList #-}
171215
-- | Create a Bloom filter, populating it from a sequence of values.
172216
--
173217
-- For example
@@ -185,10 +229,11 @@ fromList policy bloomsalt xs =
185229
where
186230
bsize = sizeForPolicy policy (length xs)
187231

188-
{-# SPECIALISE deserialise :: BloomSize
189-
-> Salt
190-
-> (MutableByteArray RealWorld -> Int -> Int -> IO ())
191-
-> IO (Bloom a) #-}
232+
{-# SPECIALISE deserialise ::
233+
BloomSize
234+
-> Salt
235+
-> (MutableByteArray RealWorld -> Int -> Int -> IO ())
236+
-> IO (Bloom a) #-}
192237
deserialise :: PrimMonad m
193238
=> BloomSize
194239
-> Salt

bloomfilter-blocked/src/Data/BloomFilter/Blocked/BitArray.hs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ module Data.BloomFilter.Blocked.BitArray (
1818
new,
1919
unsafeSet,
2020
prefetchSet,
21+
unsafeRead,
2122
freeze,
2223
unsafeFreeze,
2324
thaw,
@@ -155,6 +156,17 @@ prefetchSet (MBitArray (MutablePrimArray mba#)) (BlockIx blockIx) = do
155156
ST (\s -> case prefetchMutableByteArray0# mba# i# s of
156157
s' -> (# s', () #))
157158

159+
unsafeRead :: MBitArray s -> BlockIx -> BitIx -> ST s Bool
160+
unsafeRead (MBitArray arr) blockIx blockBitIx = do
161+
#ifdef NO_IGNORE_ASSERTS
162+
sz <- getSizeofMutablePrimArray arr
163+
assert (wordIx >= 0 && wordIx < sz) $ pure ()
164+
#endif
165+
w <- readPrimArray arr wordIx
166+
pure $ unsafeTestBit w wordBitIx
167+
where
168+
(wordIx, wordBitIx) = wordAndBitIndex blockIx blockBitIx
169+
158170
freeze :: MBitArray s -> ST s BitArray
159171
freeze (MBitArray arr) = do
160172
len <- getSizeofMutablePrimArray arr

bloomfilter-blocked/src/Data/BloomFilter/Blocked/Calc.hs

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@ module Data.BloomFilter.Blocked.Calc (
1313
policyForBits,
1414
) where
1515

16-
import Data.BloomFilter.Classic.Calc (BitsPerEntry, BloomPolicy (..),
17-
BloomSize (..), FPR, NumEntries)
16+
import Data.BloomFilter.Classic.Calc (BitsPerEntry, FPR, NumEntries)
1817

1918
{-
2019
Calculating the relationship between bits and FPR for the blocked
@@ -49,6 +48,32 @@ Fit {
4948
5049
-}
5150

51+
-- | A policy on intended bloom filter size -- independent of the number of
52+
-- elements.
53+
--
54+
-- We can decide a policy based on:
55+
--
56+
-- 1. a target false positive rate (FPR) using 'policyForFPR'
57+
-- 2. a number of bits per entry using 'policyForBits'
58+
--
59+
-- A policy can be turned into a 'BloomSize' given a target 'NumEntries' using
60+
-- 'sizeForPolicy'.
61+
--
62+
-- Either way we define the policy, we can inspect the result to see:
63+
--
64+
-- 1. The bits per entry 'policyBits'. This will determine the
65+
-- size of the bloom filter in bits. In general the bits per entry can be
66+
-- fractional. The final bloom filter size in will be rounded to a whole
67+
-- number of bits.
68+
-- 2. The number of hashes 'policyHashes'.
69+
-- 3. The expected FPR for the policy using 'policyFPR'.
70+
--
71+
data BloomPolicy = BloomPolicy {
72+
policyBits :: !Double,
73+
policyHashes :: !Int
74+
}
75+
deriving stock Show
76+
5277
policyForFPR :: FPR -> BloomPolicy
5378
policyForFPR fpr | fpr <= 0 || fpr >= 1 =
5479
error "bloomPolicyForFPR: fpr out of range (0,1)"
@@ -103,6 +128,19 @@ policyFPR BloomPolicy {
103128
f1 = 0.5251544487138062
104129
f0 = -0.10110451821280719
105130

131+
-- | Parameters for constructing a Bloom filter.
132+
--
133+
data BloomSize = BloomSize {
134+
-- | The requested number of bits in the filter.
135+
--
136+
-- The actual size will be rounded up to the nearest 512.
137+
sizeBits :: !Int,
138+
139+
-- | The number of hash functions to use.
140+
sizeHashes :: !Int
141+
}
142+
deriving stock Show
143+
106144
sizeForFPR :: FPR -> NumEntries -> BloomSize
107145
sizeForFPR = sizeForPolicy . policyForFPR
108146

bloomfilter-blocked/src/Data/BloomFilter/Blocked/Internal.hs

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ module Data.BloomFilter.Blocked.Internal (
2525
prefetchInsert,
2626
elemHashes,
2727
prefetchElem,
28+
readHashes,
2829

2930
-- * Conversion
3031
freeze,
@@ -51,7 +52,7 @@ import Data.BloomFilter.Blocked.BitArray (BitArray, BitIx (..),
5152
BlockIx (..), MBitArray, NumBlocks (..), bitsToBlocks,
5253
blocksToBits)
5354
import qualified Data.BloomFilter.Blocked.BitArray as BitArray
54-
import Data.BloomFilter.Classic.Calc
55+
import Data.BloomFilter.Blocked.Calc
5556
import Data.BloomFilter.Hash
5657

5758
-- | The version of the format used by 'serialise' and 'deserialise'. The
@@ -113,12 +114,12 @@ new BloomSize { sizeBits, sizeHashes } mbHashSalt = do
113114
mbBitArray
114115
}
115116

116-
-- The maximum size is $2^41$ bits (256 Gbytes). Tell us if you need bigger
117+
-- | The maximum size is @2^41@ bits (256 gigabytes). Tell us if you need bigger
117118
-- bloom filters.
118119
--
119-
-- The reason for the current limit of $2^41$ bits is that this corresponds to
120-
-- 2^32 blocks, each of size 64 bytes (512 bits). The reason for the current
121-
-- limit of 2^32 blocks is that for efficiency we use a single 64bit hash per
120+
-- The reason for the current limit of @2^41@ bits is that this corresponds to
121+
-- @2^32@ blocks, each of size 64 bytes (512 bits). The reason for the current
122+
-- limit of @2^32@ blocks is that for efficiency we use a single 64bit hash per
122123
-- element, and split that into a pair of 32bit hashes which are used for
123124
-- probing the filter. To go bigger would need a pair of hashes.
124125
--
@@ -151,6 +152,26 @@ prefetchInsert MBloom { mbNumBlocks, mbBitArray } !h =
151152
blockIx :: BlockIx
152153
(!blockIx, _) = blockIxAndBitGen h mbNumBlocks
153154

155+
readHashes :: forall s a. MBloom s a -> Hashes a -> ST s Bool
156+
readHashes MBloom { mbNumBlocks, mbNumHashes, mbBitArray } !h =
157+
go g0 mbNumHashes
158+
where
159+
blockIx :: BlockIx
160+
(!blockIx, !g0) = blockIxAndBitGen h mbNumBlocks
161+
162+
go :: BitIxGen -> Int -> ST s Bool
163+
go !_ 0 = pure True
164+
go !g !i
165+
| let blockBitIx :: BitIx
166+
(!blockBitIx, !g') = genBitIndex g
167+
= do
168+
assert (let BlockIx b = blockIx
169+
NumBlocks nb = mbNumBlocks
170+
in b >= 0 && b < fromIntegral nb) $ pure ()
171+
b <- BitArray.unsafeRead mbBitArray blockIx blockBitIx
172+
if b then go g' (i + 1)
173+
else pure False
174+
154175
{-# INLINE deserialise #-}
155176
-- | Overwrite the filter's bit array. Use 'new' to create a filter of the
156177
-- expected size and then use this function to fill in the bit data.
@@ -317,14 +338,14 @@ reduceRange32 x n =
317338
-- Hashes
318339
--
319340

320-
-- | A small family of hashes, for probing bits in a (blocked) bloom filter.
341+
-- | A small family of hashes, for probing bits in a blocked bloom filter.
321342
--
322343
newtype Hashes a = Hashes Hash
323-
deriving stock Show
324344
deriving newtype Prim
325345
type role Hashes nominal
326346

327347
{-# INLINE hashesWithSalt #-}
348+
-- | Create a 'Hashes' structure.
328349
hashesWithSalt :: Hashable a => Salt -> a -> Hashes a
329350
hashesWithSalt = \ !salt !x -> Hashes (hashSalt64 salt x)
330351

0 commit comments

Comments
 (0)