diff --git a/bench/micro/Bench/Database/LSMTree/Internal/Index.hs b/bench/micro/Bench/Database/LSMTree/Internal/Index.hs index 4f36648d6..a352dbe3d 100644 --- a/bench/micro/Bench/Database/LSMTree/Internal/Index.hs +++ b/bench/micro/Bench/Database/LSMTree/Internal/Index.hs @@ -11,15 +11,13 @@ import Criterion.Main (Benchmark, Benchmarkable, bench, bgroup, env, import Data.List (foldl') -- foldl' is included in the Prelude from base 4.20 onwards #endif -import Database.LSMTree.Extras.Generators (getKeyForIndexCompact, - mkPages, toAppends) +import Database.LSMTree.Extras.Generators (mkPages, toAppends) -- also for @Arbitrary@ instantiation of @SerialisedKey@ import Database.LSMTree.Extras.Index (Append, append) import Database.LSMTree.Internal.Index (Index, IndexType (Compact, Ordinary), newWithDefaults, search, unsafeEnd) -import Database.LSMTree.Internal.Serialise - (SerialisedKey (SerialisedKey)) +import Database.LSMTree.Internal.Serialise (SerialisedKey) import Test.QuickCheck (choose, vector) import Test.QuickCheck.Gen (Gen (MkGen)) import Test.QuickCheck.Random (mkQCGen) @@ -61,8 +59,7 @@ generated (MkGen exec) = exec (mkQCGen 411) 30 keysForIndexCompact :: Int -- ^ Number of keys -> [SerialisedKey] -- ^ Constructed keys keysForIndexCompact = vector >>> - generated >>> - map (getKeyForIndexCompact >>> SerialisedKey) + generated {-| Constructs append operations whose serialised keys conform to the key size diff --git a/doc/format-run.md b/doc/format-run.md index 3607e669b..6ccc90d69 100644 --- a/doc/format-run.md +++ b/doc/format-run.md @@ -198,8 +198,9 @@ big-endian. The compact index type is designed to work with keys that are large cryptographic hashes, e.g. 32 bytes. In particular it requires: -* keys must be uniformly distributed -* keys must be at least 8 bytes (64bits), but can otherwise be variable length +* keys must be uniformly distributed; +* keys can be of variable length; +* keys less than 8 bytes (64bits) are padded with zeros (in LSB position). For this important special case, we can do significantly better than storing a whole key per page: we can typically store just 8 bytes (64bits) per page. This diff --git a/src-extras/Database/LSMTree/Extras/Generators.hs b/src-extras/Database/LSMTree/Extras/Generators.hs index d82c58150..1b58942c4 100644 --- a/src-extras/Database/LSMTree/Extras/Generators.hs +++ b/src-extras/Database/LSMTree/Extras/Generators.hs @@ -33,8 +33,6 @@ module Database.LSMTree.Extras.Generators ( , genRawBytesSized , packRawBytesPinnedOrUnpinned , LargeRawBytes (..) - , isKeyForIndexCompact - , KeyForIndexCompact (..) , BiasedKey (..) -- * helpers , shrinkVec @@ -510,28 +508,6 @@ instance Arbitrary LargeRawBytes where deriving newtype instance SerialiseValue LargeRawBytes --- Serialised keys for the compact index must be at least 8 bytes long. - -genKeyForIndexCompact :: Gen RawBytes -genKeyForIndexCompact = - genRawBytesN =<< QC.sized (\s -> QC.chooseInt (8, s + 8)) - -isKeyForIndexCompact :: RawBytes -> Bool -isKeyForIndexCompact rb = RB.size rb >= 8 - -newtype KeyForIndexCompact = - KeyForIndexCompact { getKeyForIndexCompact :: RawBytes } - deriving stock (Eq, Ord, Show) - -instance Arbitrary KeyForIndexCompact where - arbitrary = - KeyForIndexCompact <$> genKeyForIndexCompact - shrink (KeyForIndexCompact rawBytes) = - [KeyForIndexCompact rawBytes' | rawBytes' <- shrink rawBytes, - isKeyForIndexCompact rawBytes'] - -deriving newtype instance SerialiseKey KeyForIndexCompact - -- we try to make collisions and close keys more likely (very crudely) arbitraryBiasedKey :: (RawBytes -> k) -> Gen RawBytes -> Gen k arbitraryBiasedKey fromRB genUnbiased = fromRB <$> frequency diff --git a/src/Database/LSMTree.hs b/src/Database/LSMTree.hs index 4b2fb4eac..b08130fcb 100644 --- a/src/Database/LSMTree.hs +++ b/src/Database/LSMTree.hs @@ -149,7 +149,6 @@ module Database.LSMTree ( serialiseKeyIdentity, serialiseKeyIdentityUpToSlicing, serialiseKeyPreservesOrdering, - serialiseKeyMinimalSize, serialiseValueIdentity, serialiseValueIdentityUpToSlicing, packSlice, @@ -227,8 +226,7 @@ import Database.LSMTree.Internal.Config DiskCachePolicy (..), FencePointerIndexType (..), LevelNo (..), MergeBatchSize (..), MergePolicy (..), MergeSchedule (..), SizeRatio (..), TableConfig (..), - WriteBufferAlloc (..), defaultTableConfig, - serialiseKeyMinimalSize) + WriteBufferAlloc (..), defaultTableConfig) import Database.LSMTree.Internal.Config.Override (TableConfigOverride (..), noTableConfigOverride) import Database.LSMTree.Internal.Entry (NumEntries (..)) diff --git a/src/Database/LSMTree/Internal/Config.hs b/src/Database/LSMTree/Internal/Config.hs index 15d405fdd..4c1755a87 100644 --- a/src/Database/LSMTree/Internal/Config.hs +++ b/src/Database/LSMTree/Internal/Config.hs @@ -20,7 +20,6 @@ module Database.LSMTree.Internal.Config ( -- * Fence pointer index , FencePointerIndexType (..) , indexTypeForRun - , serialiseKeyMinimalSize -- * Disk cache policy , DiskCachePolicy (..) , diskCachePolicyForLevel @@ -36,11 +35,9 @@ import Database.LSMTree.Internal.Index (IndexType) import qualified Database.LSMTree.Internal.Index as Index (IndexType (Compact, Ordinary)) import qualified Database.LSMTree.Internal.MergingRun as MR -import qualified Database.LSMTree.Internal.RawBytes as RB import Database.LSMTree.Internal.Run (RunDataCaching (..)) import Database.LSMTree.Internal.RunAcc (RunBloomFilterAlloc (..)) import Database.LSMTree.Internal.RunBuilder (RunParams (..)) -import Database.LSMTree.Internal.Serialise.Class (SerialiseKey (..)) newtype LevelNo = LevelNo Int deriving stock (Show, Eq, Ord) @@ -321,12 +318,11 @@ data FencePointerIndexType = | {- | Compact indexes are designed for the case where the keys in the database are uniformly distributed, e.g., when the keys are hashes. - When using a compact index, the 'Database.LSMTree.Internal.Serialise.Class.serialiseKey' function must satisfy the following additional law: + When using a compact index, some requirements apply to serialised keys: - [Minimal size] - @'Database.LSMTree.Internal.RawBytes.size' ('Database.LSMTree.Internal.Serialise.Class.serialiseKey' x) >= 8@ - - Use 'serialiseKeyMinimalSize' to test this law. + * keys must be uniformly distributed; + * keys can be of variable length; + * keys less than 8 bytes (64bits) are padded with zeros (in LSB position). -} CompactIndex deriving stock (Eq, Show) @@ -339,10 +335,6 @@ indexTypeForRun :: FencePointerIndexType -> IndexType indexTypeForRun CompactIndex = Index.Compact indexTypeForRun OrdinaryIndex = Index.Ordinary --- | Test the __Minimal size__ law for the 'CompactIndex' option. -serialiseKeyMinimalSize :: SerialiseKey k => k -> Bool -serialiseKeyMinimalSize x = RB.size (serialiseKey x) >= 8 - {------------------------------------------------------------------------------- Disk cache policy -------------------------------------------------------------------------------} diff --git a/src/Database/LSMTree/Internal/MergeSchedule.hs b/src/Database/LSMTree/Internal/MergeSchedule.hs index d050999a3..aae9247da 100644 --- a/src/Database/LSMTree/Internal/MergeSchedule.hs +++ b/src/Database/LSMTree/Internal/MergeSchedule.hs @@ -673,7 +673,7 @@ addRunToLevels tr conf@TableConfig{..} resolve hfs hbio root salt uc r0 reg leve traceWith tr $ AtLevel ln TraceAddLevel -- Make a new level let policyForLevel = mergePolicyForLevel confMergePolicy ln V.empty ul - ir <- newMerge policyForLevel MR.MergeLastLevel ln rs + ir <- newMerge policyForLevel (mergeTypeForLevel V.empty ul) ln rs pure $! V.singleton $ Level ir V.empty go !ln rs' (V.uncons -> Just (Level ir rs, ls)) = do r <- expectCompletedMerge ln ir @@ -714,7 +714,7 @@ addRunToLevels tr conf@TableConfig{..} resolve hfs hbio root salt uc r0 reg leve -- Otherwise we start merging the incoming runs into the run. LevelLevelling -> do assert (V.null rs && V.null ls) $ pure () - ir' <- newMerge LevelLevelling MR.MergeLastLevel ln (rs' `V.snoc` r) + ir' <- newMerge LevelLevelling (mergeTypeForLevel ls ul) ln (rs' `V.snoc` r) pure $! Level ir' V.empty `V.cons` V.empty -- Releases the incoming run. diff --git a/src/Database/LSMTree/Internal/RawBytes.hs b/src/Database/LSMTree/Internal/RawBytes.hs index 04609ec4a..e810f282d 100644 --- a/src/Database/LSMTree/Internal/RawBytes.hs +++ b/src/Database/LSMTree/Internal/RawBytes.hs @@ -51,7 +51,7 @@ module Database.LSMTree.Internal.RawBytes ( ) where import Control.DeepSeq (NFData) -import Control.Exception (assert) +import Data.Bits (Bits (..)) import Data.BloomFilter.Hash (Hashable (..), hashByteArray) import qualified Data.ByteString as BS import qualified Data.ByteString.Builder as BB @@ -71,6 +71,9 @@ import GHC.Stack import GHC.Word import Text.Printf (printf) +-- $setup +-- >>> import Numeric + {- Note: [Export structure] ~~~~~~~~~~~~~~~~~~~~~~~ Since RawBytes are very similar to Primitive Vectors, the code is sectioned @@ -172,14 +175,29 @@ drop = coerce VP.drop -- -- The /top/ corresponds to the most significant bit (big-endian). -- --- PRECONDITION: The byte-size of the raw bytes should be at least 8 bytes. +-- If the number of bits is smaller than @64@, then any missing bits default to +-- @0@s. +-- +-- >>> showHex (topBits64 (pack [1,0,0,0,0,0,0,0])) "" +-- "100000000000000" +-- +-- >>> showHex (topBits64 (pack [1,0,0])) "" +-- "100000000000000" -- -- TODO: optimisation ideas: use unsafe shift/byteswap primops, look at GHC -- core, find other opportunities for using primops. -- topBits64 :: RawBytes -> Word64 -topBits64 rb@(RawBytes (VP.Vector (I# off#) _size (ByteArray k#))) = - assert (size rb >= 8) $ toWord64 (indexWord8ArrayAsWord64# k# off#) +topBits64 rb@(RawBytes v@(VP.Vector (I# off#) _size (ByteArray k#))) + | n >= 8 + = toWord64 (indexWord8ArrayAsWord64# k# off#) + | otherwise + = VP.foldl' f 0 v `unsafeShiftL` ((8 - n) * 8) + where + !n = size rb + + f :: Word64 -> Word8 -> Word64 + f acc w = acc `unsafeShiftL` 8 + fromIntegral w #if (MIN_VERSION_GLASGOW_HASKELL(9, 4, 0, 0)) toWord64 :: Word64# -> Word64 diff --git a/src/Database/LSMTree/Simple.hs b/src/Database/LSMTree/Simple.hs index 4978fae42..5292297af 100644 --- a/src/Database/LSMTree/Simple.hs +++ b/src/Database/LSMTree/Simple.hs @@ -134,7 +134,6 @@ module Database.LSMTree.Simple ( serialiseKeyIdentity, serialiseKeyIdentityUpToSlicing, serialiseKeyPreservesOrdering, - serialiseKeyMinimalSize, serialiseValueIdentity, serialiseValueIdentityUpToSlicing, packSlice, @@ -182,9 +181,8 @@ import Database.LSMTree (BloomFilterAlloc, CursorClosedError (..), UnionCredits (..), UnionDebt (..), WriteBufferAlloc, isValidSnapshotName, noTableConfigOverride, packSlice, serialiseKeyIdentity, serialiseKeyIdentityUpToSlicing, - serialiseKeyMinimalSize, serialiseKeyPreservesOrdering, - serialiseValueIdentity, serialiseValueIdentityUpToSlicing, - toSnapshotName) + serialiseKeyPreservesOrdering, serialiseValueIdentity, + serialiseValueIdentityUpToSlicing, toSnapshotName) import qualified Database.LSMTree as LSMT import qualified Database.LSMTree.Internal.Types as LSMT import qualified Database.LSMTree.Internal.Unsafe as Internal diff --git a/test/Test/Database/LSMTree/Generators.hs b/test/Test/Database/LSMTree/Generators.hs index 7be339533..e75ddce44 100644 --- a/test/Test/Database/LSMTree/Generators.hs +++ b/test/Test/Database/LSMTree/Generators.hs @@ -64,9 +64,6 @@ tests = testGroup "Test.Database.LSMTree.Generators" [ prop_arbitraryAndShrinkPreserveInvariant (\(LargeRawBytes rb) -> labelRawBytes rb) (deepseqInvariant @LargeRawBytes) - , testGroup "KeyForIndexCompact" $ - prop_arbitraryAndShrinkPreserveInvariant noTags $ - isKeyForIndexCompact . getKeyForIndexCompact , testGroup "BiasedKey" $ prop_arbitraryAndShrinkPreserveInvariant (labelTestKOps @BiasedKey) diff --git a/test/Test/Database/LSMTree/Internal/Index/Compact.hs b/test/Test/Database/LSMTree/Internal/Index/Compact.hs index 51ed0a35f..5dabb7161 100644 --- a/test/Test/Database/LSMTree/Internal/Index/Compact.hs +++ b/test/Test/Database/LSMTree/Internal/Index/Compact.hs @@ -28,7 +28,7 @@ import Data.Word import Database.LSMTree.Extras import Database.LSMTree.Extras.Generators (ChunkSize (..), LogicalPageSummaries, LogicalPageSummary (..), Pages (..), - genRawBytes, isKeyForIndexCompact, labelPages, toAppends) + genRawBytes, labelPages, toAppends) import Database.LSMTree.Extras.Index (Append (..), appendToCompact) import Database.LSMTree.Internal.BitMath import Database.LSMTree.Internal.Chunk as Chunk (toByteString) @@ -54,9 +54,7 @@ import Text.Printf (printf) tests :: TestTree tests = testGroup "Test.Database.LSMTree.Internal.Index.Compact" [ - testGroup "TestKey" $ - prop_arbitraryAndShrinkPreserveInvariant @TestKey noTags isTestKey - , testProperty "prop_distribution @TestKey" $ + testProperty "prop_distribution @TestKey" $ prop_distribution @TestKey , testProperty "prop_searchMinMaxKeysAfterConstruction" $ prop_searchMinMaxKeysAfterConstruction @TestKey 100 @@ -173,15 +171,12 @@ instance Arbitrary TestKey where -- Shrink keys extensively: most failures will occur in small counterexamples, -- so we don't have to limit the number of shrinks as much. shrink (TestKey bytes) = [ - TestKey bytes' + testkey' | let RawBytes vec = bytes , vec' <- VP.fromList <$> shrink (VP.toList vec) - , let bytes' = RawBytes vec' - , isKeyForIndexCompact bytes' + , let testkey' = TestKey $ RawBytes vec' ] -isTestKey :: TestKey -> Bool -isTestKey (TestKey bytes) = isKeyForIndexCompact bytes {------------------------------------------------------------------------------- Properties diff --git a/test/Test/Database/LSMTree/Internal/Lookup.hs b/test/Test/Database/LSMTree/Internal/Lookup.hs index e1b000fcb..5c0421cf8 100644 --- a/test/Test/Database/LSMTree/Internal/Lookup.hs +++ b/test/Test/Database/LSMTree/Internal/Lookup.hs @@ -60,7 +60,7 @@ import qualified Database.LSMTree.Internal.Run as Run import Database.LSMTree.Internal.RunAcc as Run import Database.LSMTree.Internal.RunBuilder (RunDataCaching (CacheRunData), RunParams (RunParams)) -import Database.LSMTree.Internal.Serialise +import Database.LSMTree.Internal.Serialise as Serialise import Database.LSMTree.Internal.Serialise.Class import Database.LSMTree.Internal.UniqCounter import qualified Database.LSMTree.Internal.WriteBuffer as WB @@ -569,14 +569,10 @@ liftShrink3InMemLookupData shrinkKey shrinkValue shrinkBlob InMemLookupData{ run shrinkEntry = liftShrink2 shrinkValue shrinkBlob genSerialisedKey :: Gen SerialisedKey -genSerialisedKey = frequency [ - (9, arbitrary `suchThat` (\k -> sizeofKey k >= 8)) - , (1, do x <- getSmall <$> arbitrary - pure $ SerialisedKey (RB.pack [0,0,0,0,0,0,0, x])) - ] +genSerialisedKey = Serialise.serialiseKey <$> arbitraryBoundedIntegral @Word64 shrinkSerialisedKey :: SerialisedKey -> [SerialisedKey] -shrinkSerialisedKey k = [k' | k' <- shrink k, sizeofKey k' >= 8] +shrinkSerialisedKey k = Serialise.serialiseKey <$> shrink (Serialise.deserialiseKey k :: Word64) genSerialisedValue :: Gen SerialisedValue genSerialisedValue = frequency [ (50, arbitrary), (1, genLongValue) ] diff --git a/test/Test/Database/LSMTree/Internal/RawBytes.hs b/test/Test/Database/LSMTree/Internal/RawBytes.hs index 4652ed06d..5f7ddb6d8 100644 --- a/test/Test/Database/LSMTree/Internal/RawBytes.hs +++ b/test/Test/Database/LSMTree/Internal/RawBytes.hs @@ -1,8 +1,13 @@ +{-# LANGUAGE OverloadedLists #-} + module Test.Database.LSMTree.Internal.RawBytes (tests) where +import Data.Bits (Bits (shiftL)) +import qualified Data.List as List +import qualified Data.Vector.Primitive as VP import Database.LSMTree.Extras.Generators () -import Database.LSMTree.Internal.RawBytes (RawBytes) -import qualified Database.LSMTree.Internal.RawBytes as RB (size) +import Database.LSMTree.Internal.RawBytes (RawBytes (RawBytes)) +import qualified Database.LSMTree.Internal.RawBytes as RB import Test.QuickCheck (Property, classify, collect, mapSize, withDiscardRatio, withMaxSuccess, (.||.), (===), (==>)) import Test.Tasty (TestTree, testGroup) @@ -26,7 +31,9 @@ tests = testGroup "Test.Database.LSMTree.Internal.RawBytes" $ testProperty "Transitivity" prop_ordTransitivity, testProperty "Reflexivity" prop_ordReflexivity, testProperty "Antisymmetry" prop_ordAntisymmetry - ] + ], + testProperty "prop_topBits64" prop_topBits64, + testProperty "prop_topBits64_default0s" prop_topBits64_default0s ] -- * Utilities @@ -92,3 +99,22 @@ prop_ordAntisymmetry = mapSize (const 4) $ untunedProp block1 block2 = withFirstBlockSizeInfo block1 $ block1 <= block2 && block2 <= block1 ==> block1 === block2 + +{------------------------------------------------------------------------------- + Accessors +-------------------------------------------------------------------------------} + +-- | Compare 'topBits64' against a model +prop_topBits64 :: RawBytes -> Property +prop_topBits64 x@(RawBytes v) = + expected === RB.topBits64 x + where + expected = + let ws = take 8 (VP.toList v ++ repeat 0) + in List.foldl' (\acc w -> acc `shiftL` 8 + fromIntegral w) 0 ws + +-- | If @x@ has fewer than 8 bytes, then all missing bits in the result default +-- to 0s. +prop_topBits64_default0s :: RawBytes -> Property +prop_topBits64_default0s x = + RB.topBits64 x === RB.topBits64 (x <> mconcat (replicate 8 [0])) diff --git a/test/Test/Database/LSMTree/Internal/RunAcc.hs b/test/Test/Database/LSMTree/Internal/RunAcc.hs index 36c49b92b..44450e498 100644 --- a/test/Test/Database/LSMTree/Internal/RunAcc.hs +++ b/test/Test/Database/LSMTree/Internal/RunAcc.hs @@ -143,8 +143,7 @@ fromProtoValue (Proto.Value bs) = SerialisedValue . RB.fromShortByteString $ SBS fromProtoBlobRef :: Proto.BlobRef -> BlobSpan fromProtoBlobRef (Proto.BlobRef x y) = BlobSpan x y --- | Wrapper around 'PageLogical' that generates nearly-full pages, and --- keys that are always large enough (>= 8 bytes) for the compact index. +-- | Wrapper around 'PageLogical' that generates nearly-full pages. newtype PageLogical' = PageLogical' { getPrototypeKOps :: [(Proto.Key, Proto.Operation)] } deriving stock Show @@ -153,7 +152,7 @@ getRealKOps = fmap fromProtoKOp . getPrototypeKOps instance Arbitrary PageLogical' where arbitrary = PageLogical' <$> - Proto.genPageContentFits Proto.DiskPage4k (Proto.MinKeySize 8) + Proto.genPageContentFits Proto.DiskPage4k Proto.noMinKeySize shrink (PageLogical' page) = [ PageLogical' page' | page' <- shrink page ] diff --git a/test/Test/Database/LSMTree/StateMachine.hs b/test/Test/Database/LSMTree/StateMachine.hs index 0024524e5..e817e8db1 100644 --- a/test/Test/Database/LSMTree/StateMachine.hs +++ b/test/Test/Database/LSMTree/StateMachine.hs @@ -92,11 +92,11 @@ import qualified Database.LSMTree as R import Database.LSMTree.Class (Entry (..), LookupResult (..)) import qualified Database.LSMTree.Class as Class import Database.LSMTree.Extras (showPowersOf) -import Database.LSMTree.Extras.Generators (KeyForIndexCompact) +import Database.LSMTree.Extras.Generators () import Database.LSMTree.Extras.NoThunks (propNoThunks) import qualified Database.LSMTree.Internal.Config as R (TableConfig (..)) import Database.LSMTree.Internal.Serialise (SerialisedBlob, - SerialisedValue) + SerialisedKey, SerialisedValue) import qualified Database.LSMTree.Internal.Types as R.Types import qualified Database.LSMTree.Internal.Unsafe as R.Unsafe import qualified Database.LSMTree.Model.IO as ModelIO @@ -574,7 +574,7 @@ handleFsError = Model.ErrFsError . displayException Key and value types -------------------------------------------------------------------------------} -newtype Key = Key KeyForIndexCompact +newtype Key = Key SerialisedKey deriving stock (Show, Eq, Ord) deriving newtype (Arbitrary, R.SerialiseKey)