|
3 | 3 | -- implementation, import "Data.BloomFilter.Blocked".
|
4 | 4 | module Data.BloomFilter (
|
5 | 5 | module Data.BloomFilter.Classic
|
| 6 | + -- * Example: a spelling checker |
| 7 | + -- $example |
| 8 | + |
| 9 | + -- * Differences with the @bloomfilter@ package |
| 10 | + -- $differences |
6 | 11 | ) where
|
7 | 12 |
|
8 | 13 | import Data.BloomFilter.Classic
|
| 14 | + |
| 15 | +-- $example |
| 16 | +-- |
| 17 | +-- This example reads a dictionary file containing one word per line, |
| 18 | +-- constructs a Bloom filter with a 1% false positive rate, and |
| 19 | +-- spellchecks the files named on its command line. Like the Unix |
| 20 | +-- @spell@ command, it prints each word that it does not recognize. |
| 21 | +-- |
| 22 | +-- >>> import Control.Monad (forM_) |
| 23 | +-- >>> import System.Environment (getArgs) |
| 24 | +-- >>> import qualified Data.BloomFilter as B |
| 25 | +-- |
| 26 | +-- >>> :{ |
| 27 | +-- main :: IO () |
| 28 | +-- main = do |
| 29 | +-- files <- getArgs |
| 30 | +-- dictionary <- readFile "/usr/share/dict/words" |
| 31 | +-- let !bloom = B.fromList (B.policyForFPR 0.01) 4 (words dictionary) |
| 32 | +-- forM_ files $ \file -> |
| 33 | +-- putStrLn . unlines . filter (`B.notElem` bloom) . words |
| 34 | +-- =<< readFile file |
| 35 | +-- :} |
| 36 | + |
| 37 | +-- $differences |
| 38 | +-- |
| 39 | +-- This package is an entirely rewritten fork of the |
| 40 | +-- [bloomfilter](https://hackage.haskell.org/package/bloomfilter) package. |
| 41 | +-- |
| 42 | +-- The main differences are |
| 43 | +-- |
| 44 | +-- * Support for both classic and \"blocked\" Bloom filters. Block-structured |
| 45 | +-- Bloom filters arrange all the bits for each insert or lookup into a single |
| 46 | +-- cache line, which greatly reduces the number of slow uncached memory reads. |
| 47 | +-- The cost of this performance optimisation is a slightly worse |
| 48 | +-- trade-off between bits per element and the FPR. In practice for typical |
| 49 | +-- FPRs between @1e-3@ and @1e-4@, this requires a couple of extra bits per element. |
| 50 | +-- |
| 51 | +-- * This package supports Bloom filters of arbitrary sizes (not limited to powers |
| 52 | +-- of two). |
| 53 | +-- |
| 54 | +-- * Sizes over @2^32@ are supported up to @2^48@ for classic Bloom filters and |
| 55 | +-- @2^41@ for blocked Bloom filters. |
| 56 | +-- |
| 57 | +-- * The 'Bloom' and 'MBloom' types are parametrised over a 'Hashable' type |
| 58 | +-- class, instead of having a @a -> ['Hash']@ typed field. |
| 59 | +-- This separation allows clean (de-)serialisation of Bloom filters in this |
| 60 | +-- package, as the hashing scheme is static. |
| 61 | +-- |
| 62 | +-- * [@XXH3@ hash](https://xxhash.com/) is used instead of [Jenkins' |
| 63 | +-- @lookup3@](https://en.wikipedia.org/wiki/Jenkins_hash_function#lookup3). |
0 commit comments