diff --git a/bloomfilter-blocked/CHANGELOG.md b/bloomfilter-blocked/CHANGELOG.md new file mode 100644 index 000000000..05c7a746c --- /dev/null +++ b/bloomfilter-blocked/CHANGELOG.md @@ -0,0 +1,5 @@ +# Revision history for bloomfilter-blocked + +## 0.1.0.0 -- YYYY-mm-dd + +* First version. Released on an unsuspecting world. diff --git a/bloomfilter-blocked/LICENSE b/bloomfilter-blocked/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/bloomfilter-blocked/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/bloomfilter-blocked/NOTICE b/bloomfilter-blocked/NOTICE new file mode 100644 index 000000000..82751f478 --- /dev/null +++ b/bloomfilter-blocked/NOTICE @@ -0,0 +1,13 @@ +Copyright 2023 Input Output Global, Inc. (IOG), 2023-2025 INTERSECT. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/bloomfilter-blocked/README.md b/bloomfilter-blocked/README.md new file mode 100644 index 000000000..a834de183 --- /dev/null +++ b/bloomfilter-blocked/README.md @@ -0,0 +1,75 @@ +# bloomfilter-blocked + +`bloomfilter-blocked` is a Haskell library providing multiple fast and efficient +implementations of [bloom filters][bloom-filter:wiki]. It is a full rewrite of +the [`bloomfilter`][bloomfilter:hackage] package, originally authored by Bryan +O'Sullivan. + +A bloom filter is a space-efficient data structure representing a set that can +be probabilistically queried for set membership. 
The set membership query returns +no false negatives, but it might return false positives. That is, if an element +was added to a bloom filter, then a subsequent query definitely returns `True`. +If an element was *not* added to a filter, then a subsequent query may still +return `True` even if `False` would be the correct answer. The probability of false +positives -- the false positive rate (FPR) -- is configurable, as we will +describe later. + +The library includes two implementations of bloom filters: classic, and blocked. + +* **Classic** bloom filters, found in the `Data.BloomFilter.Classic` module: a + default implementation that is faithful to the canonical description of a + bloom filter data structure. + +* **Blocked** bloom filters, found in the `Data.BloomFilter.Blocked` module: an + implementation that optimises the memory layout of a classic bloom filter for + speed (cheaper CPU cache reads), at the cost of a slightly higher FPR for the + same amount of assigned memory. + +The FPR scales inversely with how much memory is assigned to the filter. Conversely, +it grows with how many elements are added to the set. The user can +configure how much memory is assigned to a filter, and the user also controls +how many elements are added to a set. Each implementation comes with helper +functions, like `sizeForFPR` and `sizeForBits`, that the user can leverage to +configure filters. + +Both immutable (`Bloom`) and mutable (`MBloom`) bloom filters, including +functions to convert between the two, are provided for each implementation. Note +however that a (mutable) bloom filter can not be resized once created, and that +elements can not be deleted once inserted. + +For more information about the library and examples of how to use it, see the +Haddock documentation of the different modules. + +# Usage notes + +Users should take into account the following: + +* This package is not supported on 32bit systems. 
+ +# Differences from the `bloomfilter` package + +The library is a full rewrite of the [`bloomfilter`][bloomfilter:hackage] +package, originally authored by Bryan O'Sullivan. The main +differences are: + +* `bloomfilter-blocked` supports both classic and blocked bloom filters, whereas + `bloomfilter` only supports the former. +* `bloomfilter-blocked` supports bloom filters of arbitrary sizes, whereas + `bloomfilter` limits the sizes to powers of two. +* `bloomfilter-blocked` supports sizes up to `2^48` for classic bloom filters + and up to `2^41` for blocked bloom filters, instead of `2^32`. +* In `bloomfilter-blocked`, the `Bloom` and `MBloom` types are parameterised + over a `Hashable` type class, instead of having a `a -> [Hash]` typed field. + This separation in `bloomfilter-blocked` allows clean (de-)serialisation of + filters as the hashing scheme is static. +* `bloomfilter-blocked` uses [`XXH3`][xxh3] for hashing instead of [Jenkins' + `lookup3`][lookup3:wiki], which `bloomfilter` uses. +* The user can configure hash salts for improved security in + `bloomfilter-blocked`, whereas this is not supported in `bloomfilter`. 
+ + + +[bloom-filter:wiki]: https://en.wikipedia.org/wiki/Bloom_filter +[bloomfilter:hackage]: https://hackage.haskell.org/package/bloomfilter +[xxh3]: https://xxhash.com/ +[lookup3:wiki]: https://en.wikipedia.org/wiki/Jenkins_hash_function#lookup3 \ No newline at end of file diff --git a/bloomfilter/bench/bloomfilter-bench.hs b/bloomfilter-blocked/bench/bloomfilter-bench.hs similarity index 96% rename from bloomfilter/bench/bloomfilter-bench.hs rename to bloomfilter-blocked/bench/bloomfilter-bench.hs index ceb2d58b1..3df2919ea 100644 --- a/bloomfilter/bench/bloomfilter-bench.hs +++ b/bloomfilter-blocked/bench/bloomfilter-bench.hs @@ -1,9 +1,8 @@ -module Main where +module Main (main) where import Criterion.Main (bench, bgroup, defaultMain, env, whnf) import qualified Data.BloomFilter.Blocked as B.Blocked import qualified Data.BloomFilter.Classic as B.Classic -import Data.BloomFilter.Hash (Hashable (..)) import Data.Word (Word64) import System.Random (StdGen, newStdGen, uniform) diff --git a/bloomfilter-blocked/bloomfilter-blocked.cabal b/bloomfilter-blocked/bloomfilter-blocked.cabal new file mode 100644 index 000000000..881e007b8 --- /dev/null +++ b/bloomfilter-blocked/bloomfilter-blocked.cabal @@ -0,0 +1,176 @@ +cabal-version: 3.4 +name: bloomfilter-blocked +version: 0.1.0.0 +synopsis: Classic and block-style bloom filters +description: Classic and block-style bloom filters. +license: Apache-2.0 +license-files: + LICENSE + NOTICE + +author: + Duncan Coutts, Joris Dral, Matthias Heinzel, Wolfgang Jeltsch, Wen Kokke, and Alex Washburn + +maintainer: duncan@well-typed.com, joris@well-typed.com +copyright: + (c) 2023 Input Output Global, Inc. 
(IOG) + (c) 2023-2025 INTERSECT + +category: Data +build-type: Simple +tested-with: + GHC ==9.2 || ==9.4 || ==9.6 || ==9.8 || ==9.10 || ==9.12 + +extra-doc-files: + CHANGELOG.md + README.md + +extra-source-files: + xxhash/include/HsXXHash.h + xxhash/xxHash-0.8.2/xxhash.h + +license-files: xxhash/xxHash-0.8.2/LICENSE-xxHash + +source-repository head + type: git + location: https://github.com/IntersectMBO/lsm-tree + subdir: bloomfilter-blocked + +source-repository this + type: git + location: https://github.com/IntersectMBO/lsm-tree + subdir: bloomfilter-blocked + tag: bloomfilter-blocked-0.1.0.0 + +common warnings + ghc-options: + -Wall -Wcompat -Wincomplete-uni-patterns + -Wincomplete-record-updates -Wpartial-fields -Widentities + -Wredundant-constraints -Wmissing-export-lists + -Wno-unticked-promoted-constructors -Wunused-packages + + ghc-options: -Werror=missing-deriving-strategies + +common language + default-language: GHC2021 + default-extensions: + DeriveAnyClass + DerivingStrategies + DerivingVia + ExplicitNamespaces + GADTs + LambdaCase + RecordWildCards + RoleAnnotations + ViewPatterns + +library + import: language, warnings + hs-source-dirs: src + build-depends: + , base >=4.16 && <4.22 + , bloomfilter-blocked:xxhash + , bytestring ^>=0.11 || ^>=0.12 + , deepseq ^>=1.4 || ^>=1.5 + , primitive ^>=0.9 + + exposed-modules: + Data.BloomFilter + Data.BloomFilter.Blocked + Data.BloomFilter.Classic + Data.BloomFilter.Hash + + other-modules: + Data.BloomFilter.Blocked.BitArray + Data.BloomFilter.Blocked.Calc + Data.BloomFilter.Blocked.Internal + Data.BloomFilter.Classic.BitArray + Data.BloomFilter.Classic.Calc + Data.BloomFilter.Classic.Internal + + ghc-options: -O2 + +test-suite tests + import: language, warnings + type: exitcode-stdio-1.0 + hs-source-dirs: tests + main-is: bloomfilter-tests.hs + build-depends: + , base <5 + , bloomfilter-blocked + , bytestring + , quickcheck-instances + , tasty + , tasty-quickcheck + +benchmark bench + import: language, warnings 
+ type: exitcode-stdio-1.0 + hs-source-dirs: bench + main-is: bloomfilter-bench.hs + build-depends: + , base + , bloomfilter-blocked + , criterion + , random + +executable fpr-calc + import: language, warnings + scope: private + hs-source-dirs: tests + main-is: fpr-calc.hs + build-depends: + , base + , bloomfilter-blocked + , containers + , parallel + , random + , regression-simple + + ghc-options: -threaded + +executable spell + import: language, warnings + scope: private + hs-source-dirs: examples + main-is: spell.hs + build-depends: + , base + , bloomfilter-blocked + +-- this exists due to windows +library xxhash + import: language, warnings + visibility: private + include-dirs: xxhash/xxHash-0.8.2/ xxhash/include/ + includes: + HsXXHash.h + xxhash.h + + exposed-modules: XXH3 + + if (arch(x86_64) && !os(osx)) + -- Cabal doesn't pass cc-options to "ordinary" Haskell source compilation + -- https://github.com/haskell/cabal/issues/9801 + ghc-options: -optc=-mavx2 -optc=-O3 + + other-modules: FFI + hs-source-dirs: xxhash/src + build-depends: + , base <5 + , bytestring + , primitive ^>=0.9 + +test-suite xxhash-tests + import: language, warnings + type: exitcode-stdio-1.0 + hs-source-dirs: xxhash/tests + main-is: xxhash-tests.hs + build-depends: + , base <5 + , bloomfilter-blocked:xxhash + , bytestring + , primitive + , tasty + , tasty-hunit + , tasty-quickcheck diff --git a/bloomfilter/examples/spell.hs b/bloomfilter-blocked/examples/spell.hs similarity index 91% rename from bloomfilter/examples/spell.hs rename to bloomfilter-blocked/examples/spell.hs index 8a3bc1957..fe144122a 100644 --- a/bloomfilter/examples/spell.hs +++ b/bloomfilter-blocked/examples/spell.hs @@ -1,7 +1,7 @@ {-# LANGUAGE BangPatterns #-} module Main (main) where -import Control.Monad (forM_, when) +import Control.Monad (forM_) import System.Environment (getArgs) import qualified Data.BloomFilter as B diff --git a/bloomfilter/fpr.blocked.gnuplot.data b/bloomfilter-blocked/fpr.blocked.gnuplot.data 
similarity index 100% rename from bloomfilter/fpr.blocked.gnuplot.data rename to bloomfilter-blocked/fpr.blocked.gnuplot.data diff --git a/bloomfilter/fpr.classic.gnuplot.data b/bloomfilter-blocked/fpr.classic.gnuplot.data similarity index 100% rename from bloomfilter/fpr.classic.gnuplot.data rename to bloomfilter-blocked/fpr.classic.gnuplot.data diff --git a/bloomfilter/fpr.gnuplot b/bloomfilter-blocked/fpr.gnuplot similarity index 100% rename from bloomfilter/fpr.gnuplot rename to bloomfilter-blocked/fpr.gnuplot diff --git a/bloomfilter/fpr.png b/bloomfilter-blocked/fpr.png similarity index 100% rename from bloomfilter/fpr.png rename to bloomfilter-blocked/fpr.png diff --git a/bloomfilter-blocked/src/Data/BloomFilter.hs b/bloomfilter-blocked/src/Data/BloomFilter.hs new file mode 100644 index 000000000..4afe1198b --- /dev/null +++ b/bloomfilter-blocked/src/Data/BloomFilter.hs @@ -0,0 +1,63 @@ +-- | By default, this module re-exports the classic bloom filter implementation +-- from "Data.BloomFilter.Classic". If you want to use the blocked bloom filter +-- implementation, import "Data.BloomFilter.Blocked". +module Data.BloomFilter ( + module Data.BloomFilter.Classic + -- * Example: a spelling checker + -- $example + + -- * Differences with the @bloomfilter@ package + -- $differences + ) where + +import Data.BloomFilter.Classic + +-- $example +-- +-- This example reads a dictionary file containing one word per line, +-- constructs a Bloom filter with a 1% false positive rate, and +-- spellchecks its standard input. Like the Unix @spell@ command, it +-- prints each word that it does not recognize. +-- +-- >>> import Control.Monad (forM_) +-- >>> import System.Environment (getArgs) +-- >>> import qualified Data.BloomFilter as B +-- +-- >>> :{ +-- main :: IO () +-- main = do +-- files <- getArgs +-- dictionary <- readFile "/usr/share/dict/words" +-- let !bloom = B.fromList (B.policyForFPR 0.01) 4 (words dictionary) +-- forM_ files $ \file -> +-- putStrLn . unlines . 
filter (`B.notElem` bloom) . words +-- =<< readFile file +-- :} + +-- $differences +-- +-- This package is an entirely rewritten fork of the +-- [bloomfilter](https://hackage.haskell.org/package/bloomfilter) package. +-- +-- The main differences are +-- +-- * Support for both classic and \"blocked\" Bloom filters. Block-structured +-- Bloom filters arrange all the bits for each insert or lookup into a single +-- cache line, which greatly reduces the number of slow uncached memory reads. +-- The trade-off for this performance optimisation is a slightly worse +-- trade-off between bits per element and the FPR. In practice for typical +-- FPRs of @1e-3@ up to @1e-4@, this requires a couple extra bits per element. +-- +-- * This package supports Bloom filters of arbitrary sizes (not limited to powers +-- of two). +-- +-- * Sizes over @2^32@ are supported up to @2^48@ for classic Bloom filters and +-- @2^41@ for blocked Bloom filters. +-- +-- * The 'Bloom' and 'MBloom' types are parametrised over a 'Hashable' type +-- class, instead of having a @a -> ['Hash']@ typed field. +-- This separation allows clean (de-)serialisation of Bloom filters in this +-- package, as the hashing scheme is static. +-- +-- * [@XXH3@ hash](https://xxhash.com/) is used instead of [Jenkins' +-- @lookup3@](https://en.wikipedia.org/wiki/Jenkins_hash_function#lookup3). diff --git a/bloomfilter/src/Data/BloomFilter/Blocked.hs b/bloomfilter-blocked/src/Data/BloomFilter/Blocked.hs similarity index 74% rename from bloomfilter/src/Data/BloomFilter/Blocked.hs rename to bloomfilter-blocked/src/Data/BloomFilter/Blocked.hs index a94a3d876..59adac9a7 100644 --- a/bloomfilter/src/Data/BloomFilter/Blocked.hs +++ b/bloomfilter-blocked/src/Data/BloomFilter/Blocked.hs @@ -1,19 +1,17 @@ --- | +-- | A fast, space efficient Bloom filter implementation. A Bloom filter is a +-- set-like data structure that provides a probabilistic membership test. -- --- A fast, space efficient Bloom filter implementation. 
A Bloom --- filter is a set-like data structure that provides a probabilistic --- membership test. +-- * Queries do not give false negatives. When an element is added to a filter, +-- a subsequent membership test will definitely return 'True'. -- --- * Queries do not give false negatives. When an element is added to --- a filter, a subsequent membership test will definitely return --- 'True'. +-- * False positives /are/ possible. If an element has not been added to a +-- filter, a membership test /may/ nevertheless indicate that the element is +-- present. -- --- * False positives /are/ possible. If an element has not been added --- to a filter, a membership test /may/ nevertheless indicate that --- the element is present. --- - module Data.BloomFilter.Blocked ( + -- * Overview + -- $overview + -- * Types Hash, Salt, @@ -57,6 +55,7 @@ module Data.BloomFilter.Blocked ( maxSizeBits, insert, insertMany, + read, -- ** Conversion freeze, @@ -68,6 +67,7 @@ module Data.BloomFilter.Blocked ( hashesWithSalt, insertHashes, elemHashes, + readHashes, -- ** Prefetching prefetchInsert, prefetchElem, @@ -80,23 +80,60 @@ import Data.Bits ((.&.)) import Data.Primitive.ByteArray (MutableByteArray) import qualified Data.Primitive.PrimArray as P -import Data.BloomFilter.Blocked.Calc +import Data.BloomFilter.Blocked.Calc (BitsPerEntry, BloomPolicy (..), + BloomSize (..), FPR, NumEntries, policyFPR, policyForBits, + policyForFPR, sizeForBits, sizeForFPR, sizeForPolicy) import Data.BloomFilter.Blocked.Internal hiding (deserialise) import qualified Data.BloomFilter.Blocked.Internal as Internal import Data.BloomFilter.Hash -import Prelude hiding (elem, notElem) +import Prelude hiding (elem, notElem, read) + +-- $setup +-- +-- >>> import Text.Printf + +-- $overview +-- +-- Each of the functions for creating Bloom filters accepts a 'BloomSize'. The +-- size determines the number of bits that should be used for the filter. 
Note +-- that a filter is fixed in size; it cannot be resized after creation. +-- +-- The size can be specified by asking for a target false positive rate (FPR) +-- or a number of bits per element, and the number of elements in the filter. +-- For example: +-- +-- * @'sizeForFPR' 1e-3 10_000@ for a Bloom filter sized for 10,000 elements +-- with a false positive rate of 1 in 1000 +-- +-- * @'sizeForBits' 10 10_000@ for a Bloom filter sized for 10,000 elements +-- with 10 bits per element +-- +-- Depending on the application it may be more important to target a fixed +-- amount of memory to use, or target a specific FPR. +-- +-- As a very rough guide for filter sizes, here are a range of FPRs and bits +-- per element: +-- +-- * FPR of 1e-1 requires approximately 4.8 bits per element +-- * FPR of 1e-2 requires approximately 9.8 bits per element +-- * FPR of 1e-3 requires approximately 15.8 bits per element +-- * FPR of 1e-4 requires approximately 22.6 bits per element +-- * FPR of 1e-5 requires approximately 30.2 bits per element +-- +-- >>> fmap (printf "%0.1f" . policyBits . policyForFPR) [1e-1, 1e-2, 1e-3, 1e-4, 1e-5] :: [String] +-- ["4.8","9.8","15.8","22.6","30.2"] -- | Create an immutable Bloom filter, using the given setup function -- which executes in the 'ST' monad. -- -- Example: -- --- @ +-- >>> :{ -- filter = create (sizeForBits 16 2) 4 $ \mf -> do --- insert mf \"foo\" --- insert mf \"bar\" --- @ +-- insert mf "foo" +-- insert mf "bar" +-- :} -- -- Note that the result of the setup function is not used. create :: BloomSize @@ -141,6 +178,12 @@ elem = \ !x !b -> elemHashes b (hashesWithSalt (hashSalt b) x) notElem :: Hashable a => a -> Bloom a -> Bool notElem = \x b -> not (x `elem` b) +-- | Query a mutable Bloom filter for membership. If the value is +-- present, return @True@. If the value is not present, there is +-- /still/ some possibility that @True@ will be returned. 
+read :: Hashable a => MBloom s a -> a -> ST s Bool +read !mb !x = readHashes mb (hashesWithSalt (mbHashSalt mb) x) -- hash x with the filter's own salt, then probe the mutable filter + -- | Build an immutable Bloom filter from a seed value. The seeding -- function populates the filter as follows. -- @@ -168,6 +211,7 @@ unfold bloomsize bloomsalt f k = Nothing -> pure () Just (a, j') -> insert mb a >> loop j' +{-# INLINEABLE fromList #-} -- | Create a Bloom filter, populating it from a sequence of values. -- -- For example @@ -185,10 +229,11 @@ fromList policy bloomsalt xs = where bsize = sizeForPolicy policy (length xs) -{-# SPECIALISE deserialise :: BloomSize - -> Salt - -> (MutableByteArray RealWorld -> Int -> Int -> IO ()) - -> IO (Bloom a) #-} +{-# SPECIALISE deserialise :: + BloomSize + -> Salt + -> (MutableByteArray RealWorld -> Int -> Int -> IO ()) + -> IO (Bloom a) #-} deserialise :: PrimMonad m => BloomSize -> Salt diff --git a/bloomfilter/src/Data/BloomFilter/Blocked/BitArray.hs b/bloomfilter-blocked/src/Data/BloomFilter/Blocked/BitArray.hs similarity index 94% rename from bloomfilter/src/Data/BloomFilter/Blocked/BitArray.hs rename to bloomfilter-blocked/src/Data/BloomFilter/Blocked/BitArray.hs index d283fe69a..4743d84b1 100644 --- a/bloomfilter/src/Data/BloomFilter/Blocked/BitArray.hs +++ b/bloomfilter-blocked/src/Data/BloomFilter/Blocked/BitArray.hs @@ -18,6 +18,7 @@ module Data.BloomFilter.Blocked.BitArray ( new, unsafeSet, prefetchSet, + unsafeRead, freeze, unsafeFreeze, thaw, @@ -155,6 +156,17 @@ prefetchSet (MBitArray (MutablePrimArray mba#)) (BlockIx blockIx) = do ST (\s -> case prefetchMutableByteArray0# mba# i# s of s' -> (# s', () #)) +unsafeRead :: MBitArray s -> BlockIx -> BitIx -> ST s Bool +unsafeRead (MBitArray arr) blockIx blockBitIx = do +#ifdef NO_IGNORE_ASSERTS + sz <- getSizeofMutablePrimArray arr + assert (wordIx >= 0 && wordIx < sz) $ pure () -- bounds check; only compiled in when NO_IGNORE_ASSERTS is defined +#endif + w <- readPrimArray arr wordIx -- load the word that holds the requested bit + pure $ unsafeTestBit w wordBitIx -- test the bit at position wordBitIx within that word + where + (wordIx, wordBitIx) = wordAndBitIndex blockIx blockBitIx -- translate (block, bit-in-block) into (word index, bit-in-word) + freeze :: 
MBitArray s -> ST s BitArray freeze (MBitArray arr) = do len <- getSizeofMutablePrimArray arr diff --git a/bloomfilter/src/Data/BloomFilter/Blocked/Calc.hs b/bloomfilter-blocked/src/Data/BloomFilter/Blocked/Calc.hs similarity index 73% rename from bloomfilter/src/Data/BloomFilter/Blocked/Calc.hs rename to bloomfilter-blocked/src/Data/BloomFilter/Blocked/Calc.hs index e1ed5776a..2cf53701a 100644 --- a/bloomfilter/src/Data/BloomFilter/Blocked/Calc.hs +++ b/bloomfilter-blocked/src/Data/BloomFilter/Blocked/Calc.hs @@ -13,8 +13,7 @@ module Data.BloomFilter.Blocked.Calc ( policyForBits, ) where -import Data.BloomFilter.Classic.Calc (BitsPerEntry, BloomPolicy (..), - BloomSize (..), FPR, NumEntries) +import Data.BloomFilter.Classic.Calc (BitsPerEntry, FPR, NumEntries) {- Calculating the relationship between bits and FPR for the blocked @@ -49,6 +48,32 @@ Fit { -} +-- | A policy on intended bloom filter size -- independent of the number of +-- elements. +-- +-- We can decide a policy based on: +-- +-- 1. a target false positive rate (FPR) using 'policyForFPR' +-- 2. a number of bits per entry using 'policyForBits' +-- +-- A policy can be turned into a 'BloomSize' given a target 'NumEntries' using +-- 'sizeForPolicy'. +-- +-- Either way we define the policy, we can inspect the result to see: +-- +-- 1. The bits per entry 'policyBits'. This will determine the +-- size of the bloom filter in bits. In general the bits per entry can be +-- fractional. The final bloom filter size will be rounded to a whole +-- number of bits. +-- 2. The number of hashes 'policyHashes'. +-- 3. The expected FPR for the policy using 'policyFPR'. 
+-- +data BloomPolicy = BloomPolicy { + policyBits :: !Double, + policyHashes :: !Int + } + deriving stock Show + policyForFPR :: FPR -> BloomPolicy policyForFPR fpr | fpr <= 0 || fpr >= 1 = error "bloomPolicyForFPR: fpr out of range (0,1)" @@ -103,6 +128,19 @@ policyFPR BloomPolicy { f1 = 0.5251544487138062 f0 = -0.10110451821280719 +-- | Parameters for constructing a Bloom filter. +-- +data BloomSize = BloomSize { + -- | The requested number of bits in the filter. + -- + -- The actual size will be rounded up to the nearest multiple of 512. + sizeBits :: !Int, + + -- | The number of hash functions to use. + sizeHashes :: !Int + } + deriving stock Show + sizeForFPR :: FPR -> NumEntries -> BloomSize sizeForFPR = sizeForPolicy . policyForFPR diff --git a/bloomfilter/src/Data/BloomFilter/Blocked/Internal.hs b/bloomfilter-blocked/src/Data/BloomFilter/Blocked/Internal.hs similarity index 90% rename from bloomfilter/src/Data/BloomFilter/Blocked/Internal.hs rename to bloomfilter-blocked/src/Data/BloomFilter/Blocked/Internal.hs index 5dc41cdcb..24e2420da 100644 --- a/bloomfilter/src/Data/BloomFilter/Blocked/Internal.hs +++ b/bloomfilter-blocked/src/Data/BloomFilter/Blocked/Internal.hs @@ -25,6 +25,7 @@ module Data.BloomFilter.Blocked.Internal ( prefetchInsert, elemHashes, prefetchElem, + readHashes, -- * Conversion freeze, @@ -51,7 +52,7 @@ import Data.BloomFilter.Blocked.BitArray (BitArray, BitIx (..), BlockIx (..), MBitArray, NumBlocks (..), bitsToBlocks, blocksToBits) import qualified Data.BloomFilter.Blocked.BitArray as BitArray -import Data.BloomFilter.Classic.Calc +import Data.BloomFilter.Blocked.Calc import Data.BloomFilter.Hash -- | The version of the format used by 'serialise' and 'deserialise'. The @@ -113,12 +114,12 @@ new BloomSize { sizeBits, sizeHashes } mbHashSalt = do mbBitArray } --- The maximum size is $2^41$ bits (256 Gbytes). Tell us if you need bigger +-- | The maximum size is @2^41@ bits (256 gigabytes). Tell us if you need bigger -- bloom filters. 
-- --- The reason for the current limit of $2^41$ bits is that this corresponds to --- 2^32 blocks, each of size 64 bytes (512 bits). The reason for the current --- limit of 2^32 blocks is that for efficiency we use a single 64bit hash per +-- The reason for the current limit of @2^41@ bits is that this corresponds to +-- @2^32@ blocks, each of size 64 bytes (512 bits). The reason for the current +-- limit of @2^32@ blocks is that for efficiency we use a single 64bit hash per -- element, and split that into a pair of 32bit hashes which are used for -- probing the filter. To go bigger would need a pair of hashes. -- @@ -151,6 +152,26 @@ prefetchInsert MBloom { mbNumBlocks, mbBitArray } !h = blockIx :: BlockIx (!blockIx, _) = blockIxAndBitGen h mbNumBlocks +readHashes :: forall s a. MBloom s a -> Hashes a -> ST s Bool -- ^ 'True' iff all 'mbNumHashes' probed bits of the selected block are set +readHashes MBloom { mbNumBlocks, mbNumHashes, mbBitArray } !h = + go g0 mbNumHashes + where + blockIx :: BlockIx + (!blockIx, !g0) = blockIxAndBitGen h mbNumBlocks + + go :: BitIxGen -> Int -> ST s Bool + go !_ 0 = pure True + go !g !i + | let blockBitIx :: BitIx + (!blockBitIx, !g') = genBitIndex g + = do + assert (let BlockIx b = blockIx + NumBlocks nb = mbNumBlocks + in b >= 0 && b < fromIntegral nb) $ pure () + b <- BitArray.unsafeRead mbBitArray blockIx blockBitIx + if b then go g' (i - 1) -- count down so that the @go _ 0@ base case is reached after 'mbNumHashes' probes + else pure False + {-# INLINE deserialise #-} -- | Overwrite the filter's bit array. Use 'new' to create a filter of the -- expected size and then use this function to fill in the bit data. @@ -317,14 +338,14 @@ reduceRange32 x n = -- Hashes -- --- | A small family of hashes, for probing bits in a (blocked) bloom filter. +-- | A small family of hashes, for probing bits in a blocked bloom filter. -- newtype Hashes a = Hashes Hash - deriving stock Show deriving newtype Prim type role Hashes nominal {-# INLINE hashesWithSalt #-} +-- | Create a 'Hashes' structure. 
hashesWithSalt :: Hashable a => Salt -> a -> Hashes a hashesWithSalt = \ !salt !x -> Hashes (hashSalt64 salt x) diff --git a/bloomfilter/src/Data/BloomFilter/Classic.hs b/bloomfilter-blocked/src/Data/BloomFilter/Classic.hs similarity index 69% rename from bloomfilter/src/Data/BloomFilter/Classic.hs rename to bloomfilter-blocked/src/Data/BloomFilter/Classic.hs index 15375b532..3f4c57321 100644 --- a/bloomfilter/src/Data/BloomFilter/Classic.hs +++ b/bloomfilter-blocked/src/Data/BloomFilter/Classic.hs @@ -1,25 +1,17 @@ --- | A fast, space efficient Bloom filter implementation. A Bloom --- filter is a set-like data structure that provides a probabilistic --- membership test. +-- | A fast, space efficient Bloom filter implementation. A Bloom filter is a +-- set-like data structure that provides a probabilistic membership test. -- --- * Queries do not give false negatives. When an element is added to --- a filter, a subsequent membership test will definitely return --- 'True'. +-- * Queries do not give false negatives. When an element is added to a filter, +-- a subsequent membership test will definitely return 'True'. -- --- * False positives /are/ possible. If an element has not been added --- to a filter, a membership test /may/ nevertheless indicate that --- the element is present. +-- * False positives /are/ possible. If an element has not been added to a +-- filter, a membership test /may/ nevertheless indicate that the element is +-- present. -- module Data.BloomFilter.Classic ( -- * Overview -- $overview - -- ** Example: a spell checker - -- $example - - -- ** Differences from bloomfilter package - -- $differences - -- * Types Hash, Salt, @@ -89,16 +81,51 @@ import Data.BloomFilter.Hash import Prelude hiding (elem, notElem, read) +-- $setup +-- +-- >>> import Text.Printf + +-- $overview +-- +-- Each of the functions for creating Bloom filters accepts a 'BloomSize'. The +-- size determines the number of bits that should be used for the filter. 
Note +-- that a filter is fixed in size; it cannot be resized after creation. +-- +-- The size can be specified by asking for a target false positive rate (FPR) +-- or a number of bits per element, and the number of elements in the filter. +-- For example: +-- +-- * @'sizeForFPR' 1e-3 10_000@ for a Bloom filter sized for 10,000 elements +-- with a false positive rate of 1 in 1000 +-- +-- * @'sizeForBits' 10 10_000@ for a Bloom filter sized for 10,000 elements +-- with 10 bits per element +-- +-- Depending on the application it may be more important to target a fixed +-- amount of memory to use, or target a specific FPR. +-- +-- As a very rough guide for filter sizes, here are a range of FPRs and bits +-- per element: +-- +-- * FPR of 1e-1 requires approximately 4.8 bits per element +-- * FPR of 1e-2 requires approximately 9.6 bits per element +-- * FPR of 1e-3 requires approximately 14.4 bits per element +-- * FPR of 1e-4 requires approximately 19.2 bits per element +-- * FPR of 1e-5 requires approximately 24.0 bits per element +-- +-- >>> fmap (printf "%0.1f" . policyBits . policyForFPR) [1e-1, 1e-2, 1e-3, 1e-4, 1e-5] :: [String] +-- ["4.8","9.6","14.4","19.2","24.0"] + -- | Create an immutable Bloom filter, using the given setup function -- which executes in the 'ST' monad. -- -- Example: -- --- @ ---filter = create (sizeForBits 16 2) 4 $ \mf -> do --- insert mf \"foo\" --- insert mf \"bar\" --- @ +-- >>> :{ +-- filter = create (sizeForBits 16 2) 4 $ \mf -> do +-- insert mf "foo" +-- insert mf "bar" +-- :} -- -- Note that the result of the setup function is not used. create :: BloomSize @@ -173,7 +200,6 @@ unfold bloomsize bloomsalt f k = Nothing -> pure () Just (a, j') -> insert mb a >> loop j' - {-# INLINEABLE fromList #-} -- | Create a Bloom filter, populating it from a sequence of values. 
-- @@ -192,10 +218,11 @@ fromList policy bsalt xs = where bsize = sizeForPolicy policy (length xs) -{-# SPECIALISE deserialise :: BloomSize - -> Salt - -> (MutableByteArray RealWorld -> Int -> Int -> IO ()) - -> IO (Bloom a) #-} +{-# SPECIALISE deserialise :: + BloomSize + -> Salt + -> (MutableByteArray RealWorld -> Int -> Int -> IO ()) + -> IO (Bloom a) #-} deserialise :: PrimMonad m => BloomSize -> Salt @@ -205,75 +232,3 @@ deserialise bloomsalt bloomsize fill = do mbloom <- stToPrim $ new bloomsalt bloomsize Internal.deserialise mbloom fill stToPrim $ unsafeFreeze mbloom - --- $overview --- --- Each of the functions for creating Bloom filters accepts a 'BloomSize'. The --- size determines the number of bits that should be used for the filter. Note --- that a filter is fixed in size; it cannot be resized after creation. --- --- The size can be specified by asking for a target false positive rate (FPR) --- or a number of bits per element, and the number of elements in the filter. --- For example: --- --- * @'sizeForFPR' 1e-3 10_000@ for a Bloom filter sized for 10,000 elements --- with a false positive rate of 1 in 1000 --- --- * @'sizeForBits' 10 10_000@ for a Bloom filter sized for 10,000 elements --- with 10 bits per element --- --- Depending on the application it may be more important to target a fixed --- amount of memory to use, or target a specific FPR. --- --- As a very rough guide for filter sizes, here are a range of FPRs and bits --- per element: --- --- * FPR of 1e-1 requires approximately 4.8 bits per element --- * FPR of 1e-2 requires approximately 9.6 bits per element --- * FPR of 1e-3 requires approximately 14.4 bits per element --- * FPR of 1e-4 requires approximately 19.2 bits per element --- * FPR of 1e-5 requires approximately 24.0 bits per element --- - --- $example --- --- This example reads a dictionary file containing one word per line, --- constructs a Bloom filter with a 1% false positive rate, and --- spellchecks its standard input. 
Like the Unix @spell@ command, it --- prints each word that it does not recognize. --- --- @ --- import Data.Maybe (mapMaybe) --- import qualified Data.BloomFilter as B --- --- main = do --- filt \<- B.fromList (B.policyForFPR 0.01) . words \<$> readFile "\/usr\/share\/dict\/words" --- let check word | B.elem word filt = Nothing --- | otherwise = Just word --- interact (unlines . mapMaybe check . lines) --- @ - --- $differences --- --- This package is an entirely rewritten fork of --- [bloomfilter](https://hackage.haskell.org/package/bloomfilter) package. --- --- The main differences are --- --- * This packages support bloomfilters of arbitrary sizes --- (not limited to powers of two). Also sizes over 2^32 are supported. --- --- * The 'Bloom' and 'MBloom' types are parametrised over a 'Hashable' type --- class, instead of having a @a -> ['Hash']@ typed field. --- This separation allows clean de\/serialization of Bloom filters in this --- package, as the hashing scheme is static. --- --- * [@XXH3@ hash](https://xxhash.com/) is used instead of Jenkins' --- @lookup3@. --- --- * Support for both classic and \"blocked\" Bloom filters. Blocked-structured --- Bloom filters arrange all the bits for each insert or lookup into a single --- cache line, which greatly reduces the number of slow uncached memory reads. --- The trade-off for this performance optimisation is a slightly worse --- trade-off between bits per element and the FPR. In practice for typical --- FPRs of 1-e3 -- 1e-4, this requires a couple extra bits per element. 
diff --git a/bloomfilter/src/Data/BloomFilter/Classic/BitArray.hs b/bloomfilter-blocked/src/Data/BloomFilter/Classic/BitArray.hs similarity index 100% rename from bloomfilter/src/Data/BloomFilter/Classic/BitArray.hs rename to bloomfilter-blocked/src/Data/BloomFilter/Classic/BitArray.hs diff --git a/bloomfilter/src/Data/BloomFilter/Classic/Calc.hs b/bloomfilter-blocked/src/Data/BloomFilter/Classic/Calc.hs similarity index 97% rename from bloomfilter/src/Data/BloomFilter/Classic/Calc.hs rename to bloomfilter-blocked/src/Data/BloomFilter/Classic/Calc.hs index be21a1eac..2f5fc0f4f 100644 --- a/bloomfilter/src/Data/BloomFilter/Classic/Calc.hs +++ b/bloomfilter-blocked/src/Data/BloomFilter/Classic/Calc.hs @@ -127,8 +127,7 @@ policyFPR BloomPolicy { -- | Parameters for constructing a Bloom filter. -- data BloomSize = BloomSize { - -- | The requested number of bits in filter. - -- The actual size will be rounded up to the nearest 512. + -- | The requested number of bits in the filter. sizeBits :: !Int, -- | The number of hash functions to use. diff --git a/bloomfilter/src/Data/BloomFilter/Classic/Internal.hs b/bloomfilter-blocked/src/Data/BloomFilter/Classic/Internal.hs similarity index 97% rename from bloomfilter/src/Data/BloomFilter/Classic/Internal.hs rename to bloomfilter-blocked/src/Data/BloomFilter/Classic/Internal.hs index ed08da5b3..b4a83b905 100644 --- a/bloomfilter/src/Data/BloomFilter/Classic/Internal.hs +++ b/bloomfilter-blocked/src/Data/BloomFilter/Classic/Internal.hs @@ -116,8 +116,8 @@ new BloomSize { sizeBits, sizeHashes } mbHashSalt = do mbBitArray } --- | The maximum filter size is $2^48$ bits. Tell us if you need bigger bloom --- filters. +-- | The maximum filter size is @2^48@ bits (32 terabytes). Tell us if you need +-- bigger bloom filters. -- maxSizeBits :: Int maxSizeBits = 0x1_0000_0000_0000 @@ -316,10 +316,9 @@ word64ToWordShim# x# = x# -- Hashes -- --- | A pair of hashes used for a double hashing scheme. --- --- See 'evalHashes'. 
+-- | A small family of hashes, for probing bits in a classic bloom filter. data Hashes a = Hashes !Hash !Hash +-- pair of hashes used for a double hashing scheme. type role Hashes nominal instance Prim (Hashes a) where @@ -433,9 +432,8 @@ https://github.com/facebook/rocksdb/blob/096fb9b67d19a9a180e7c906b4a0cdb2b2d0c1f evalHashes :: Hashes a -> Int -> Hash evalHashes (Hashes h1 h2) i = h1 + (h2 `unsafeShiftR` i) --- | Create 'Hashes' structure. --- --- It's simply hashes the value twice using seed 0 and 1. +-- | Create a 'Hashes' structure. hashesWithSalt :: Hashable a => Salt -> a -> Hashes a +-- It simply hashes the value twice, using the salts @salt@ and @salt + 1@. hashesWithSalt salt v = Hashes (hashSalt64 salt v) (hashSalt64 (salt + 1) v) {-# INLINE hashesWithSalt #-} diff --git a/bloomfilter/src/Data/BloomFilter/Hash.hs b/bloomfilter-blocked/src/Data/BloomFilter/Hash.hs similarity index 100% rename from bloomfilter/src/Data/BloomFilter/Hash.hs rename to bloomfilter-blocked/src/Data/BloomFilter/Hash.hs diff --git a/bloomfilter/tests/bloomfilter-tests.hs b/bloomfilter-blocked/tests/bloomfilter-tests.hs similarity index 86% rename from bloomfilter/tests/bloomfilter-tests.hs rename to bloomfilter-blocked/tests/bloomfilter-tests.hs index 59e5e30cb..e169c9fa8 100644 --- a/bloomfilter/tests/bloomfilter-tests.hs +++ b/bloomfilter-blocked/tests/bloomfilter-tests.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE TypeFamilies #-} module Main (main) where import qualified Data.BloomFilter.Blocked as Bloom.Blocked @@ -16,6 +17,7 @@ import Test.QuickCheck.Instances () import Test.Tasty import Test.Tasty.QuickCheck +import Data.Kind (Type) import Prelude hiding (elem, notElem) main :: IO () @@ -123,14 +125,14 @@ prop_calc_policy_fpr proxy (FPR lb, FPR ub) t (FPR fpr) = where (~~~) = withinTolerance t -prop_calc_policy_bits :: BloomFilter bloom => Proxy bloom +prop_calc_policy_bits :: forall bloom. 
BloomFilter bloom => Proxy bloom -> (BitsPerEntry, BitsPerEntry) -> Double -> BitsPerEntry -> Property prop_calc_policy_bits proxy (BitsPerEntry lb, BitsPerEntry ub) t (BitsPerEntry c) = c >= lb && c <= ub ==> let policy = policyForBits proxy c - c' = B.policyBits policy + c' = policyBits (Proxy @bloom) policy fpr = policyFPR proxy policy policy' = policyForFPR proxy fpr fpr' = policyFPR proxy policy' @@ -139,22 +141,22 @@ prop_calc_policy_bits proxy (BitsPerEntry lb, BitsPerEntry ub) t (~~~) = withinTolerance t -- | Compare @sizeHashes . sizeForBits@ against @numHashFunctions@ -prop_calc_size_hashes_bits :: BloomFilter bloom => Proxy bloom +prop_calc_size_hashes_bits :: forall bloom. BloomFilter bloom => Proxy bloom -> BitsPerEntry -> NumEntries -> Property prop_calc_size_hashes_bits proxy (BitsPerEntry c) (NumEntries numEntries) = let bsize = sizeForBits proxy c numEntries - in numHashFunctions (fromIntegral (B.sizeBits bsize)) + in numHashFunctions (fromIntegral (sizeBits (Proxy @bloom) bsize)) (fromIntegral numEntries) - === fromIntegral (B.sizeHashes bsize) + === fromIntegral (sizeHashes (Proxy @bloom) bsize) -- | Compare @sizeForFPR@ against @falsePositiveRate@ with some tolerance for deviations -prop_calc_size_fpr_fpr :: BloomFilter bloom => Proxy bloom +prop_calc_size_fpr_fpr :: forall bloom. 
BloomFilter bloom => Proxy bloom -> FPR -> NumEntries -> Property prop_calc_size_fpr_fpr proxy (FPR fpr) (NumEntries numEntries) = let bsize = sizeForFPR proxy fpr numEntries - in falsePositiveRate (fromIntegral (B.sizeBits bsize)) + in falsePositiveRate (fromIntegral (sizeBits (Proxy @bloom) bsize)) (fromIntegral numEntries) - (fromIntegral (B.sizeHashes bsize)) + (fromIntegral (sizeHashes (Proxy @bloom) bsize)) ~~~ fpr where (~~~) = withinTolerance tolerance @@ -171,14 +173,14 @@ prop_calc_size_fpr_fpr proxy (FPR fpr) (NumEntries numEntries) = | otherwise = 1e-3 -- | Compare @sizeForBits@ against @falsePositiveRate@ with some tolerance for deviations -prop_calc_size_fpr_bits :: BloomFilter bloom => Proxy bloom +prop_calc_size_fpr_bits :: forall bloom. BloomFilter bloom => Proxy bloom -> BitsPerEntry -> NumEntries -> Property prop_calc_size_fpr_bits proxy (BitsPerEntry bpe) (NumEntries numEntries) = let policy = policyForBits proxy bpe bsize = sizeForPolicy proxy policy numEntries - in falsePositiveRate (fromIntegral (B.sizeBits bsize)) + in falsePositiveRate (fromIntegral (sizeBits (Proxy @bloom) bsize)) (fromIntegral numEntries) - (fromIntegral (B.sizeHashes bsize)) + (fromIntegral (sizeHashes (Proxy @bloom) bsize)) ~~~ policyFPR proxy policy where (~~~) = withinTolerance tolerance @@ -273,18 +275,36 @@ prop_insertMany (FPR fpr) keys = ------------------------------------------------------------------------------- class BloomFilter bloom where - fromList :: Hashable a => B.BloomPolicy -> B.Salt -> [a] -> bloom a + type BloomPolicy bloom :: Type + + policyBits :: Proxy bloom -> BloomPolicy bloom -> Double + + type BloomSize bloom :: Type + + sizeBits :: Proxy bloom -> BloomSize bloom -> Int + sizeHashes :: Proxy bloom -> BloomSize bloom -> Int + + fromList :: Hashable a => BloomPolicy bloom -> B.Salt -> [a] -> bloom a elem :: Hashable a => a -> bloom a -> Bool notElem :: Hashable a => a -> bloom a -> Bool - sizeForFPR :: Proxy bloom -> B.FPR -> B.NumEntries -> 
B.BloomSize - sizeForBits :: Proxy bloom -> B.BitsPerEntry -> B.NumEntries -> B.BloomSize - sizeForPolicy :: Proxy bloom -> B.BloomPolicy -> B.NumEntries -> B.BloomSize - policyForFPR :: Proxy bloom -> B.FPR -> B.BloomPolicy - policyForBits :: Proxy bloom -> B.BitsPerEntry -> B.BloomPolicy - policyFPR :: Proxy bloom -> B.BloomPolicy -> B.FPR + sizeForFPR :: Proxy bloom -> B.FPR -> B.NumEntries -> BloomSize bloom + sizeForBits :: Proxy bloom -> B.BitsPerEntry -> B.NumEntries -> BloomSize bloom + sizeForPolicy :: Proxy bloom -> BloomPolicy bloom -> B.NumEntries -> BloomSize bloom + policyForFPR :: Proxy bloom -> B.FPR -> BloomPolicy bloom + policyForBits :: Proxy bloom -> B.BitsPerEntry -> BloomPolicy bloom + policyFPR :: Proxy bloom -> BloomPolicy bloom -> B.FPR instance BloomFilter Bloom.Classic.Bloom where + type instance BloomPolicy Bloom.Classic.Bloom = Bloom.Classic.BloomPolicy + + policyBits _ = Bloom.Classic.policyBits + + type instance BloomSize Bloom.Classic.Bloom = Bloom.Classic.BloomSize + + sizeBits _ = Bloom.Classic.sizeBits + sizeHashes _ = Bloom.Classic.sizeHashes + fromList = Bloom.Classic.fromList elem = Bloom.Classic.elem notElem = Bloom.Classic.notElem @@ -297,6 +317,15 @@ instance BloomFilter Bloom.Classic.Bloom where policyFPR _ = Bloom.Classic.policyFPR instance BloomFilter Bloom.Blocked.Bloom where + type instance BloomPolicy Bloom.Blocked.Bloom = Bloom.Blocked.BloomPolicy + + policyBits _ = Bloom.Blocked.policyBits + + type instance BloomSize Bloom.Blocked.Bloom = Bloom.Blocked.BloomSize + + sizeBits _ = Bloom.Blocked.sizeBits + sizeHashes _ = Bloom.Blocked.sizeHashes + fromList = Bloom.Blocked.fromList elem = Bloom.Blocked.elem notElem = Bloom.Blocked.notElem diff --git a/bloomfilter/tests/fpr-calc.hs b/bloomfilter-blocked/tests/fpr-calc.hs similarity index 86% rename from bloomfilter/tests/fpr-calc.hs rename to bloomfilter-blocked/tests/fpr-calc.hs index 6fb5a467a..b58f55f55 100644 --- a/bloomfilter/tests/fpr-calc.hs +++ 
b/bloomfilter-blocked/tests/fpr-calc.hs @@ -1,8 +1,7 @@ {-# LANGUAGE ParallelListComp #-} module Main (main) where -import qualified Data.BloomFilter as B (BitsPerEntry, BloomPolicy, BloomSize, - FPR, Hashable, Salt) +import qualified Data.BloomFilter as B (BitsPerEntry, FPR, Hashable, Salt) import qualified Data.BloomFilter.Blocked as B.Blocked import qualified Data.BloomFilter.Classic as B.Classic @@ -99,7 +98,7 @@ main_generateData = do ys_classic_actual = ys_actual classicBloomImpl xs_classic ys_blocked_actual = ys_actual blockedBloomImpl xs_blocked - ys_calc :: BloomImpl b -> [(Double, StdGen)] -> [Double] + ys_calc :: BloomImpl b p s -> [(Double, StdGen)] -> [Double] ys_calc BloomImpl{..} xs = [ fpr | (bitsperkey, _) <- xs @@ -107,7 +106,7 @@ main_generateData = do fpr = policyFPR policy ] - ys_actual :: BloomImpl b -> [(Double, StdGen)] -> [Double] + ys_actual :: BloomImpl b p s -> [(Double, StdGen)] -> [Double] ys_actual impl@BloomImpl{..} xs = withStrategy (parList rseq) -- eval in parallel [ fpr @@ -133,14 +132,14 @@ main_generateData = do ] -} -actualFalsePositiveRate :: BloomImpl bloom - -> B.BloomPolicy -> Int -> StdGen -> Double +actualFalsePositiveRate :: BloomImpl bloom policy size + -> policy -> Int -> StdGen -> Double actualFalsePositiveRate bloomimpl policy n g0 = fromIntegral (countFalsePositives bloomimpl policy n g0) / fromIntegral n -countFalsePositives :: forall bloom. BloomImpl bloom - -> B.BloomPolicy -> Int -> StdGen -> Int +countFalsePositives :: forall bloom policy size. 
BloomImpl bloom policy size + -> policy -> Int -> StdGen -> Int countFalsePositives BloomImpl{..} policy n g0 = let (!g01, !g02) = splitGen g0 @@ -173,18 +172,18 @@ countFalsePositives BloomImpl{..} policy n g0 = where (!x, !g') = uniform g -data BloomImpl bloom = BloomImpl { - policyForBits :: B.BitsPerEntry -> B.BloomPolicy, - policyForFPR :: B.FPR -> B.BloomPolicy, - policyBits :: B.BloomPolicy -> B.BitsPerEntry, - policyFPR :: B.BloomPolicy -> B.FPR, - sizeForPolicy :: B.BloomPolicy -> Int -> B.BloomSize, +data BloomImpl bloom policy size = BloomImpl { + policyForBits :: B.BitsPerEntry -> policy, + policyForFPR :: B.FPR -> policy, + policyBits :: policy -> B.BitsPerEntry, + policyFPR :: policy -> B.FPR, + sizeForPolicy :: policy -> Int -> size, unfold :: forall a b. B.Hashable a - => B.BloomSize -> B.Salt -> (b -> Maybe (a, b)) -> b -> bloom a, + => size -> B.Salt -> (b -> Maybe (a, b)) -> b -> bloom a, elem :: forall a. B.Hashable a => a -> bloom a -> Bool } -classicBloomImpl :: BloomImpl B.Classic.Bloom +classicBloomImpl :: BloomImpl B.Classic.Bloom B.Classic.BloomPolicy B.Classic.BloomSize classicBloomImpl = BloomImpl { policyForBits = B.Classic.policyForBits, @@ -196,7 +195,7 @@ classicBloomImpl = elem = B.Classic.elem } -blockedBloomImpl :: BloomImpl B.Blocked.Bloom +blockedBloomImpl :: BloomImpl B.Blocked.Bloom B.Blocked.BloomPolicy B.Blocked.BloomSize blockedBloomImpl = BloomImpl { policyForBits = B.Blocked.policyForBits, diff --git a/xxhash/include/HsXXHash.h b/bloomfilter-blocked/xxhash/include/HsXXHash.h similarity index 100% rename from xxhash/include/HsXXHash.h rename to bloomfilter-blocked/xxhash/include/HsXXHash.h diff --git a/xxhash/src/FFI.hs b/bloomfilter-blocked/xxhash/src/FFI.hs similarity index 98% rename from xxhash/src/FFI.hs rename to bloomfilter-blocked/xxhash/src/FFI.hs index 472f5c367..3ddcec135 100644 --- a/xxhash/src/FFI.hs +++ b/bloomfilter-blocked/xxhash/src/FFI.hs @@ -19,7 +19,7 @@ module FFI ( ) where import Data.Word (Word32, 
Word64, Word8) -import Foreign.C.Types (CInt (..), CSize (..)) +import Foreign.C.Types (CSize (..)) import Foreign.Ptr (Ptr) import GHC.Exts (ByteArray#, MutableByteArray#) diff --git a/xxhash/src/XXH3.hs b/bloomfilter-blocked/xxhash/src/XXH3.hs similarity index 98% rename from xxhash/src/XXH3.hs rename to bloomfilter-blocked/xxhash/src/XXH3.hs index f37182552..8a110d1fc 100644 --- a/xxhash/src/XXH3.hs +++ b/bloomfilter-blocked/xxhash/src/XXH3.hs @@ -18,12 +18,10 @@ module XXH3 ( xxh3_64bit_update_w32, ) where -import Control.Monad (unless) import Control.Monad.ST (ST) import Control.Monad.ST.Unsafe (unsafeIOToST) import Data.ByteString.Internal (ByteString (..), accursedUnutterablePerformIO) -import Data.Coerce (coerce) import qualified Data.Primitive as P import Data.Primitive.ByteArray (ByteArray (..)) import Data.Word (Word32, Word64) diff --git a/xxhash/tests/xxhash-tests.hs b/bloomfilter-blocked/xxhash/tests/xxhash-tests.hs similarity index 100% rename from xxhash/tests/xxhash-tests.hs rename to bloomfilter-blocked/xxhash/tests/xxhash-tests.hs diff --git a/xxhash/xxHash-0.8.2/LICENSE-xxHash b/bloomfilter-blocked/xxhash/xxHash-0.8.2/LICENSE-xxHash similarity index 100% rename from xxhash/xxHash-0.8.2/LICENSE-xxHash rename to bloomfilter-blocked/xxhash/xxHash-0.8.2/LICENSE-xxHash diff --git a/xxhash/xxHash-0.8.2/xxhash.h b/bloomfilter-blocked/xxhash/xxHash-0.8.2/xxhash.h similarity index 100% rename from xxhash/xxHash-0.8.2/xxhash.h rename to bloomfilter-blocked/xxhash/xxHash-0.8.2/xxhash.h diff --git a/bloomfilter/LICENSE-bloomfilter b/bloomfilter/LICENSE-bloomfilter deleted file mode 100644 index b998dd8a3..000000000 --- a/bloomfilter/LICENSE-bloomfilter +++ /dev/null @@ -1,32 +0,0 @@ -Copyright 2008 Bryan O'Sullivan . -Copyright (c) 2023 IOG Singapore Pte. Ltd. -Copyright (c) 2024 Cardano Development Foundation - -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -3. Neither the name of the author nor the names of his contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS ``AS IS'' AND ANY EXPRESS -OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. diff --git a/bloomfilter/README.markdown b/bloomfilter/README.markdown deleted file mode 100644 index c5ed69542..000000000 --- a/bloomfilter/README.markdown +++ /dev/null @@ -1,29 +0,0 @@ -# A fast, space efficient Bloom filter implementation - -Copyright 2008, 2009, 2010, 2011 Bryan O'Sullivan . - -This package provides both mutable and immutable Bloom filter data -types, along with a family of hash function and an easy-to-use -interface. 
- -To build: - - cabal install bloomfilter - -For examples of usage, see the Haddock documentation and the files in -the examples directory. - - -# Get involved! - -Please report bugs via the -[github issue tracker](https://github.com/haskell-pkg-janitors/bloomfilter). - -Master [git repository](https://github.com/haskell-pkg-janitors/bloomfilter): - -* `git clone git://github.com/haskell-pkg-janitors/bloomfilter.git` - - -# Authors - -This library is written by Bryan O'Sullivan, . diff --git a/bloomfilter/src/Data/BloomFilter.hs b/bloomfilter/src/Data/BloomFilter.hs deleted file mode 100644 index 064a3349b..000000000 --- a/bloomfilter/src/Data/BloomFilter.hs +++ /dev/null @@ -1,5 +0,0 @@ -module Data.BloomFilter ( - module Data.BloomFilter.Classic - ) where - -import Data.BloomFilter.Classic diff --git a/cabal.project.release b/cabal.project.release index 983c972b6..1ba1883e6 100644 --- a/cabal.project.release +++ b/cabal.project.release @@ -6,6 +6,7 @@ index-state: packages: . ./blockio + ./bloomfilter-blocked tests: True benchmarks: True diff --git a/lsm-tree.cabal b/lsm-tree.cabal index 5acd79f27..3c25af76f 100644 --- a/lsm-tree.cabal +++ b/lsm-tree.cabal @@ -1,7 +1,7 @@ -cabal-version: 3.4 -name: lsm-tree -version: 0.1.0.0 -synopsis: Log-structured merge-trees +cabal-version: 3.4 +name: lsm-tree +version: 0.1.0.0 +synopsis: Log-structured merge-trees description: This package contains an efficient implementation of on-disk key–value storage, implemented as a log-structured merge-tree or LSM-tree. An LSM-tree is a data structure for key–value mappings, similar to "Data.Map", but optimized for large tables with a high insertion volume. 
@@ -477,7 +477,7 @@ description: \"Constructing and analyzing the LSM compaction design space.\" [doi:10.14778/3476249.3476274](https://doi.org/10.14778/3476249.3476274) -license: Apache-2.0 +license: Apache-2.0 license-files: LICENSE NOTICE @@ -485,24 +485,15 @@ license-files: author: Duncan Coutts, Joris Dral, Matthias Heinzel, Wolfgang Jeltsch, Wen Kokke, and Alex Washburn -maintainer: TODO: MAINTAINER EMAIL +maintainer: TODO: MAINTAINER EMAIL copyright: (c) 2023 Input Output Global, Inc. (IOG) (c) 2023-2025 INTERSECT -category: Database -build-type: Simple -tested-with: - GHC ==9.2 || ==9.4 || ==9.6 || ==9.8 || ==9.10 || ==9.12 - -extra-doc-files: CHANGELOG.md -extra-source-files: - xxhash/include/HsXXHash.h - xxhash/xxHash-0.8.2/xxhash.h - -license-files: - bloomfilter/LICENSE-bloomfilter - xxhash/xxHash-0.8.2/LICENSE-xxHash +category: Database +build-type: Simple +tested-with: GHC ==9.2 || ==9.4 || ==9.6 || ==9.8 || ==9.10 || ==9.12 +extra-doc-files: CHANGELOG.md source-repository head type: git @@ -610,6 +601,7 @@ library , base >=4.16 && <4.22 , bitvec ^>=1.1 , blockio ^>=0.1 + , bloomfilter-blocked , bytestring ^>=0.11.4.0 || ^>=0.12.1.0 , cborg ^>=0.2.10.0 , containers ^>=0.6 || ^>=0.7 @@ -620,7 +612,6 @@ library , fs-api ^>=0.4 , io-classes ^>=1.6 || ^>=1.7 || ^>=1.8.0.1 , io-classes:strict-mvar - , lsm-tree:bloomfilter , lsm-tree:control , lsm-tree:kmerge , primitive ^>=0.9 @@ -636,119 +627,6 @@ library build-depends: data-elevator ^>=0.1.0.2 || ^>=0.2 cpp-options: -DHAVE_STRICT_ARRAY --- this exists due windows -library xxhash - import: language - visibility: private - include-dirs: xxhash/xxHash-0.8.2/ xxhash/include/ - includes: - HsXXHash.h - xxhash.h - - exposed-modules: XXH3 - - if (arch(x86_64) && !os(osx)) - -- Cabal doesn't pass cc-options to "ordinary" Haskell source compilation - -- https://github.com/haskell/cabal/issues/9801 - ghc-options: -optc=-mavx2 -optc=-O3 - - other-modules: FFI - hs-source-dirs: xxhash/src - build-depends: - , base 
<5 - , bytestring - , primitive ^>=0.9 - -test-suite xxhash-tests - import: language - type: exitcode-stdio-1.0 - hs-source-dirs: xxhash/tests - main-is: xxhash-tests.hs - build-depends: - , base <5 - , bytestring - , lsm-tree:xxhash - , primitive - , tasty - , tasty-hunit - , tasty-quickcheck - --- this fork doesn't work on 32bit systems -library bloomfilter - import: language, warnings - visibility: private - hs-source-dirs: bloomfilter/src - build-depends: - , base >=4.16 && <5 - , bytestring >=0.9 - , deepseq - , lsm-tree:xxhash - , primitive - - exposed-modules: - Data.BloomFilter - Data.BloomFilter.Blocked - Data.BloomFilter.Classic - Data.BloomFilter.Hash - - other-modules: - Data.BloomFilter.Blocked.BitArray - Data.BloomFilter.Blocked.Calc - Data.BloomFilter.Blocked.Internal - Data.BloomFilter.Classic.BitArray - Data.BloomFilter.Classic.Calc - Data.BloomFilter.Classic.Internal - - ghc-options: -O2 - -test-suite bloomfilter-tests - import: language, warnings - type: exitcode-stdio-1.0 - hs-source-dirs: bloomfilter/tests - main-is: bloomfilter-tests.hs - build-depends: - , base <5 - , bytestring - , lsm-tree:bloomfilter - , quickcheck-instances - , tasty - , tasty-quickcheck - -benchmark bloomfilter-bench - import: language - type: exitcode-stdio-1.0 - hs-source-dirs: bloomfilter/bench - main-is: bloomfilter-bench.hs - build-depends: - , base - , criterion - , lsm-tree:bloomfilter - , random - -executable bloomfilter-fpr-calc - import: language, warnings - scope: private - hs-source-dirs: bloomfilter/tests - main-is: fpr-calc.hs - build-depends: - , base - , containers - , lsm-tree:bloomfilter - , parallel - , random - , regression-simple - - ghc-options: -threaded - -executable bloomfilter-spell - import: language - scope: private - hs-source-dirs: bloomfilter/examples - main-is: spell.hs - build-depends: - , base - , lsm-tree:bloomfilter - library extras import: language, warnings visibility: private @@ -864,6 +742,7 @@ test-suite lsm-tree-test , bitvec , 
blockio , blockio:sim + , bloomfilter-blocked , bytestring , cborg , constraints @@ -881,7 +760,6 @@ test-suite lsm-tree-test , io-classes:strict-stm , io-sim , lsm-tree - , lsm-tree:bloomfilter , lsm-tree:control , lsm-tree:extras , lsm-tree:prototypes @@ -929,8 +807,9 @@ benchmark lsm-tree-micro-bench Bench.Database.LSMTree.Internal.WriteBuffer build-depends: - , base <5 + , base <5 , blockio + , bloomfilter-blocked , bytestring , containers , contra-tracer @@ -939,7 +818,6 @@ benchmark lsm-tree-micro-bench , directory , fs-api , lsm-tree - , lsm-tree:bloomfilter , lsm-tree:control , lsm-tree:extras , QuickCheck @@ -955,9 +833,9 @@ benchmark lsm-tree-bench-bloomfilter hs-source-dirs: bench/macro main-is: lsm-tree-bench-bloomfilter.hs build-depends: - , base <5 + , base <5 + , bloomfilter-blocked , lsm-tree - , lsm-tree:bloomfilter , lsm-tree:extras , random , time @@ -972,13 +850,13 @@ benchmark lsm-tree-bench-lookups hs-source-dirs: bench/macro main-is: lsm-tree-bench-lookups.hs build-depends: - , base <5 + , base <5 , blockio + , bloomfilter-blocked , deepseq , fs-api , io-classes , lsm-tree - , lsm-tree:bloomfilter , lsm-tree:control , lsm-tree:extras , primitive