@@ -89,6 +89,79 @@ import Data.BloomFilter.Hash
89
89
90
90
import Prelude hiding (elem , notElem , read )
91
91
92
+ -- $overview
93
+ --
94
+ -- Each of the functions for creating Bloom filters accepts a 'BloomSize'. The
95
+ -- size determines the number of bits that should be used for the filter. Note
96
+ -- that a filter is fixed in size; it cannot be resized after creation.
97
+ --
98
+ -- The size can be specified by asking for a target false positive rate (FPR)
99
+ -- or a number of bits per element, and the number of elements in the filter.
100
+ -- For example:
101
+ --
102
+ -- * @'sizeForFPR' 1e-3 10_000@ for a Bloom filter sized for 10,000 elements
103
+ -- with a false positive rate of 1 in 1000
104
+ --
105
+ -- * @'sizeForBits' 10 10_000@ for a Bloom filter sized for 10,000 elements
106
+ -- with 10 bits per element
107
+ --
108
+ -- Depending on the application it may be more important to target a fixed
109
+ -- amount of memory to use, or target a specific FPR.
110
+ --
111
+ -- As a very rough guide for filter sizes, here are a range of FPRs and bits
112
+ -- per element:
113
+ --
114
+ -- * FPR of 1e-1 requires approximately 4.8 bits per element
115
+ -- * FPR of 1e-2 requires approximately 9.6 bits per element
116
+ -- * FPR of 1e-3 requires approximately 14.4 bits per element
117
+ -- * FPR of 1e-4 requires approximately 19.2 bits per element
118
+ -- * FPR of 1e-5 requires approximately 24.0 bits per element
119
+
120
+
121
+ -- $example
122
+ --
123
+ -- This example reads a dictionary file containing one word per line,
124
+ -- constructs a Bloom filter with a 1% false positive rate, and
125
+ -- spellchecks its standard input. Like the Unix @spell@ command, it
126
+ -- prints each word that it does not recognize.
127
+ --
128
+ -- @
129
+ -- import Data.Maybe (mapMaybe)
130
+ -- import qualified Data.BloomFilter as B
131
+ --
132
+ -- main = do
133
+ -- filt \<- B.fromList (B.policyForFPR 0.01) . words \<$> readFile "\/usr\/share\/dict\/words"
134
+ -- let check word | B.elem word filt = Nothing
135
+ -- | otherwise = Just word
136
+ -- interact (unlines . mapMaybe check . lines)
137
+ -- @
138
+
139
+
140
+ -- $differences
141
+ --
142
+ -- This package is an entirely rewritten fork of
143
+ -- the [bloomfilter](https://hackage.haskell.org/package/bloomfilter) package.
144
+ --
145
+ -- The main differences are:
146
+ --
147
+ -- * This package supports Bloom filters of arbitrary sizes
148
+ -- (not limited to powers of two). Also sizes over 2^32 are supported.
149
+ --
150
+ -- * The 'Bloom' and 'MBloom' types are parametrised over a 'Hashable' type
151
+ -- class, instead of having a @a -> ['Hash']@ typed field.
152
+ -- This separation allows clean de\/serialization of Bloom filters in this
153
+ -- package, as the hashing scheme is static.
154
+ --
155
+ -- * [@XXH3@ hash](https://xxhash.com/) is used instead of Jenkins'
156
+ -- @lookup3@.
157
+ --
158
+ -- * Support for both classic and \"blocked\" Bloom filters. Blocked-structured
159
+ -- Bloom filters arrange all the bits for each insert or lookup into a single
160
+ -- cache line, which greatly reduces the number of slow uncached memory reads.
161
+ -- The cost of this performance optimisation is a slightly worse
162
+ -- trade-off between bits per element and the FPR. In practice for typical
163
+ -- FPRs of 1e-3 -- 1e-4, this requires a couple of extra bits per element.
164
+
92
165
-- | Create an immutable Bloom filter, using the given setup function
93
166
-- which executes in the 'ST' monad.
94
167
--
@@ -205,75 +278,3 @@ deserialise bloomsalt bloomsize fill = do
205
278
mbloom <- stToPrim $ new bloomsalt bloomsize
206
279
Internal. deserialise mbloom fill
207
280
stToPrim $ unsafeFreeze mbloom
208
-
209
- -- $overview
210
- --
211
- -- Each of the functions for creating Bloom filters accepts a 'BloomSize'. The
212
- -- size determines the number of bits that should be used for the filter. Note
213
- -- that a filter is fixed in size; it cannot be resized after creation.
214
- --
215
- -- The size can be specified by asking for a target false positive rate (FPR)
216
- -- or a number of bits per element, and the number of elements in the filter.
217
- -- For example:
218
- --
219
- -- * @'sizeForFPR' 1e-3 10_000@ for a Bloom filter sized for 10,000 elements
220
- -- with a false positive rate of 1 in 1000
221
- --
222
- -- * @'sizeForBits' 10 10_000@ for a Bloom filter sized for 10,000 elements
223
- -- with 10 bits per element
224
- --
225
- -- Depending on the application it may be more important to target a fixed
226
- -- amount of memory to use, or target a specific FPR.
227
- --
228
- -- As a very rough guide for filter sizes, here are a range of FPRs and bits
229
- -- per element:
230
- --
231
- -- * FPR of 1e-1 requires approximately 4.8 bits per element
232
- -- * FPR of 1e-2 requires approximately 9.6 bits per element
233
- -- * FPR of 1e-3 requires approximately 14.4 bits per element
234
- -- * FPR of 1e-4 requires approximately 19.2 bits per element
235
- -- * FPR of 1e-5 requires approximately 24.0 bits per element
236
- --
237
-
238
- -- $example
239
- --
240
- -- This example reads a dictionary file containing one word per line,
241
- -- constructs a Bloom filter with a 1% false positive rate, and
242
- -- spellchecks its standard input. Like the Unix @spell@ command, it
243
- -- prints each word that it does not recognize.
244
- --
245
- -- @
246
- -- import Data.Maybe (mapMaybe)
247
- -- import qualified Data.BloomFilter as B
248
- --
249
- -- main = do
250
- -- filt \<- B.fromList (B.policyForFPR 0.01) . words \<$> readFile "\/usr\/share\/dict\/words"
251
- -- let check word | B.elem word filt = Nothing
252
- -- | otherwise = Just word
253
- -- interact (unlines . mapMaybe check . lines)
254
- -- @
255
-
256
- -- $differences
257
- --
258
- -- This package is an entirely rewritten fork of
259
- -- [bloomfilter](https://hackage.haskell.org/package/bloomfilter) package.
260
- --
261
- -- The main differences are
262
- --
263
- -- * This packages support bloomfilters of arbitrary sizes
264
- -- (not limited to powers of two). Also sizes over 2^32 are supported.
265
- --
266
- -- * The 'Bloom' and 'MBloom' types are parametrised over a 'Hashable' type
267
- -- class, instead of having a @a -> ['Hash']@ typed field.
268
- -- This separation allows clean de\/serialization of Bloom filters in this
269
- -- package, as the hashing scheme is static.
270
- --
271
- -- * [@XXH3@ hash](https://xxhash.com/) is used instead of Jenkins'
272
- -- @lookup3@.
273
- --
274
- -- * Support for both classic and \"blocked\" Bloom filters. Blocked-structured
275
- -- Bloom filters arrange all the bits for each insert or lookup into a single
276
- -- cache line, which greatly reduces the number of slow uncached memory reads.
277
- -- The trade-off for this performance optimisation is a slightly worse
278
- -- trade-off between bits per element and the FPR. In practice for typical
279
- -- FPRs of 1-e3 -- 1e-4, this requires a couple extra bits per element.
0 commit comments