diff --git a/.gitignore b/.gitignore index e14b1e8..c9b8b02 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ TODO .emacs.desktop dist dist-newstyle +.stack-work/ +b-tree \ No newline at end of file diff --git a/Setup.hs b/Setup.hs deleted file mode 100644 index 9a994af..0000000 --- a/Setup.hs +++ /dev/null @@ -1,2 +0,0 @@ -import Distribution.Simple -main = defaultMain diff --git a/b-tree.cabal b/b-tree.cabal index 87e83b4..174c6e2 100644 --- a/b-tree.cabal +++ b/b-tree.cabal @@ -1,81 +1,93 @@ -name: b-tree -version: 0.1.4 -synopsis: Immutable disk-based B* trees -description: Immutable disk-based B* trees -homepage: http://github.com/bgamari/b-tree -license: BSD3 -license-file: LICENSE -author: Ben Gamari -maintainer: bgamari.foss@gmail.com -copyright: (c) 2014 Ben Gamari -category: Data -build-type: Simple -cabal-version: >=1.10 +cabal-version: 1.12 -source-repository head - type: git - location: git@github.com:bgamari/b-tree +-- This file has been generated from package.yaml by hpack version 0.31.2. +-- +-- see: https://github.com/sol/hpack +-- +-- hash: 75e5224834801bdcd562d1165007ce48fe2da076410bcca562bfe7670ee02342 -library - exposed-modules: BTree, - BTree.BinaryList - other-modules: BTree.Types, - BTree.Merge, - BTree.Builder, - BTree.Lookup, - BTree.Walk, - BTree.BinaryFile, - BTree.BuildUnordered +name: b-tree +version: 0.1.4 +synopsis: Immutable disk-based B* trees +description: Immutable disk-based B* trees +category: Data +homepage: http://github.com/bgamari/b-tree +author: Ben Gamari +maintainer: bgamari.foss@gmail.com +copyright: (c) 2014 Ben Gamari +license: BSD3 +license-file: LICENSE +build-type: Simple - other-extensions: DeriveGeneric, - FlexibleContexts, - TemplateHaskell, - UndecidableInstances, - StandaloneDeriving, - BangPatterns, - GeneralizedNewtypeDeriving +source-repository head + type: git + location: git@github.com:bgamari/b-tree - hs-source-dirs: src - build-depends: base >=4.6 && <4.13, - mtl >=2.0 && <3.0, - pipes >=4.1 && <4.4, - pipes-interleave >= 1.0 && <2.2, - bytestring >=0.10 && <0.11, - binary >=0.8.4 && <0.11, - transformers >=0.3 && <0.6, - lens >=3.10 && <4.18, - containers >=0.5 && <0.7, - vector >=0.10 && <0.13, - vector-binary-instances >= 0.2 && < 0.3, - errors >=2.0 && <2.4, - exceptions >= 0.8 && < 0.11, - filepath >=1.3 && <1.5, - directory >=1.2 && <1.4, - mmap >=0.5 && <0.6 - default-language: Haskell2010 - ghc-options: -Wall +library + exposed-modules: + BTree + BTree.BinaryList + other-modules: + BTree.BinaryFile + BTree.Builder + BTree.BuildUnordered + BTree.Lookup + BTree.Merge + BTree.Types + BTree.Walk + Paths_b_tree + hs-source-dirs: + src + other-extensions: DeriveGeneric FlexibleContexts TemplateHaskell UndecidableInstances StandaloneDeriving BangPatterns GeneralizedNewtypeDeriving + ghc-options: -Wall + build-depends: + base >=4.6 && <4.13 + , binary >=0.8.4 && <0.11 + , bytestring >=0.10 && <0.11 + , containers >=0.5 && <0.7 + , directory >=1.2 && <1.4 + , errors >=2.0 && <2.4 + , exceptions >=0.8 && <0.11 + , filepath >=1.3 && <1.5 + , lens >=3.10 && <4.18 + , mmap >=0.5 && <0.6 + , mtl >=2.0 && <3.0 + , pipes >=4.1 && <4.4 + , pipes-interleave >=1.0 && <2.2 + , transformers >=0.3 && <0.6 + , vector >=0.10 && <0.13 + , vector-binary-instances >=0.2 && <0.3 + default-language: Haskell2010 test-suite btree-quickcheck - type: exitcode-stdio-1.0 - main-is: QuickCheck.hs - hs-source-dirs: tests - default-language: Haskell2010 - build-depends: base, - containers, - pipes, - binary, - b-tree, - QuickCheck >= 2.8.2, - tasty, - tasty-quickcheck + type: exitcode-stdio-1.0 + main-is: QuickCheck.hs + other-modules: + TestUnorderedMerge + Paths_b_tree + hs-source-dirs: + tests + build-depends: + QuickCheck >=2.8.2 + , b-tree + , base + , binary + , containers + , pipes + , tasty + , tasty-quickcheck + default-language: Haskell2010 benchmark btree-benchmark - type: exitcode-stdio-1.0 - main-is: Benchmark.hs - hs-source-dirs: benchmarks - default-language: Haskell2010 - build-depends: base, - b-tree, - pipes, - criterion >=0.8 && <2.0 - + type: exitcode-stdio-1.0 + main-is: Benchmark.hs + other-modules: + Paths_b_tree + hs-source-dirs: + benchmarks + build-depends: + b-tree + , base + , criterion >=0.8 && <2.0 + , pipes + default-language: Haskell2010 diff --git a/package.yaml b/package.yaml new file mode 100644 index 0000000..a50f75c --- /dev/null +++ b/package.yaml @@ -0,0 +1,65 @@ +name: b-tree +version: '0.1.4' +synopsis: Immutable disk-based B* trees +description: Immutable disk-based B* trees +category: Data +author: Ben Gamari +maintainer: bgamari.foss@gmail.com +copyright: (c) 2014 Ben Gamari +license: BSD3 +homepage: http://github.com/bgamari/b-tree +git: git@github.com:bgamari/b-tree +library: + source-dirs: src + other-extensions: + - BangPatterns + - DeriveGeneric + - FlexibleContexts + - GeneralizedNewtypeDeriving + - LambdaCase + - StandaloneDeriving + - TemplateHaskell + - UndecidableInstances + ghc-options: -Wall + exposed-modules: + - BTree + - BTree.BinaryList + dependencies: + - base >=4.6 && <4.13 + - mtl >=2.0 && <3.0 + - pipes >=4.1 && <4.4 + - pipes-interleave >=1.0 && <2.2 + - bytestring >=0.10 && <0.11 + - binary >=0.8.4 && <0.11 + - transformers >=0.3 && <0.6 + - lens >=3.10 && <4.18 + - containers >=0.5 && <0.7 + - vector >=0.10 && <0.13 + - vector-binary-instances >=0.2 && <0.3 + - errors >=2.0 && <2.4 + - exceptions >=0.8 && <0.11 + - filepath >=1.3 && <1.5 + - directory >=1.2 && <1.4 + - mmap >=0.5 && <0.6 +tests: + btree-quickcheck: + main: QuickCheck.hs + source-dirs: tests + dependencies: + - base + - containers + - pipes + - binary + - b-tree + - QuickCheck >=2.8.2 + - tasty + - tasty-quickcheck +benchmarks: + btree-benchmark: + main: Benchmark.hs + source-dirs: benchmarks + dependencies: + - base + - b-tree + - pipes + - criterion >=0.8 && <2.0 diff --git a/src/BTree/BinaryFile.hs b/src/BTree/BinaryFile.hs index 5a8767e..42ead62 100644 --- a/src/BTree/BinaryFile.hs +++ b/src/BTree/BinaryFile.hs @@ -2,7 +2,9 @@ -- a trailing "header". module BTree.BinaryFile ( writeWithHeader + , hWriteWithHeader , readWithHeader + , hReadWithHeader ) where import Control.Monad (when) @@ -79,8 +81,16 @@ readWithHeader :: (MonadMask m, MonadIO m, B.Binary hdr) => FilePath -> (hdr -> Handle -> m a) -> ExceptT String m a -readWithHeader fname action = do - r <- lift $ bracket (liftIO $ openFile fname ReadMode) (liftIO . hClose) $ \h -> runExceptT $ do +readWithHeader fname action = ExceptT $ bracket (liftIO $ openFile fname ReadMode) (liftIO . hClose) + $ \h -> runExceptT $ hReadWithHeader h action + + +hReadWithHeader :: (MonadMask m, MonadIO m, B.Binary hdr) + => Handle + -> (hdr -> Handle -> m a) + -> ExceptT String m a + +hReadWithHeader h action = do -- read epilogue liftIO $ hSeek h SeekFromEnd (-epiLength) epiBytes <- liftIO (LBS.hGet h $ fromIntegral epiLength) @@ -96,4 +106,3 @@ readWithHeader fname action = do liftIO $ hSeek h AbsoluteSeek 0 lift $ action hdr h - ExceptT $ return r diff --git a/src/BTree/BinaryList.hs b/src/BTree/BinaryList.hs index c5871f0..71cb6ea 100644 --- a/src/BTree/BinaryList.hs +++ b/src/BTree/BinaryList.hs @@ -10,7 +10,6 @@ module BTree.BinaryList , stream -- * Other queries , length - , filePath ) where import Control.Applicative @@ -31,13 +30,9 @@ import Pipes import BTree.BinaryFile -- | A file containing a finite list of binary encoded items -newtype BinaryList a = BinList FilePath +newtype BinaryList a = BinList Handle deriving (Show) --- | Get the path to the @BinaryList@ file -filePath :: BinaryList a -> FilePath -filePath (BinList f) = f - data Header = Header { hdrLength :: Word64 } deriving (Show) @@ -47,9 +42,9 @@ instance B.Binary Header where -- | Encode the items of the given producer toBinaryList :: forall m a r. (MonadMask m, MonadIO m, B.Binary a) - => FilePath -> Producer a m r -> m (BinaryList a, r) + => Handle -> Producer a m r -> m (BinaryList a, r) toBinaryList fname producer = do - writeWithHeader fname (go 0 producer BB.empty) + hWriteWithHeader fname (go 0 producer BB.empty) where go :: Int -> Producer a m r -> BB.Builder -> Producer LBS.ByteString m (Header, (BinaryList a, r)) @@ -71,12 +66,16 @@ toBinaryList fname producer = do -- | Open a 'BinaryList' file. -- -- TODO: Sanity checking at open time. -open :: FilePath -> BinaryList a -open = BinList +open :: Handle -> BinaryList a +open = BinList +{-# INLINE open #-} withHeader :: (MonadMask m, MonadIO m) - => BinaryList a -> (Header -> Handle -> m b) -> ExceptT String m b -withHeader (BinList fname) action = readWithHeader fname action + => BinaryList a + -> (Header -> Handle -> m b) + -> ExceptT String m b + +withHeader (BinList fhandle) action = hReadWithHeader fhandle action length :: (MonadMask m, MonadIO m) => BinaryList a -> ExceptT String m Word64 diff --git a/src/BTree/BuildUnordered.hs b/src/BTree/BuildUnordered.hs index 942453d..458b70d 100644 --- a/src/BTree/BuildUnordered.hs +++ b/src/BTree/BuildUnordered.hs @@ -13,7 +13,7 @@ import Control.Error import Data.Traversable (forM) import qualified Data.Binary as B -import qualified Data.Set as S +import qualified Data.Map.Strict as M import System.IO import System.Directory (removeFile) @@ -28,11 +28,11 @@ import BTree.Builder maxChunkMerge :: Int maxChunkMerge = 100 -tempFilePath :: FilePath -> String -> IO FilePath -tempFilePath dir template = do - (fname, h) <- liftIO $ openTempFile dir template - hClose h - return fname + +type Resource = (FilePath,Handle) + +cleanResource :: Resource -> IO () +cleanResource (fname,fhandle) = hClose fhandle >> removeFile fname -- | Build a B-tree into the given file. -- @@ -40,51 +40,54 @@ tempFilePath dir template = do -- the sorting is handled internally through a simple merge sort. Chunks of -- leaves are collected, sorted in memory, and then written to intermediate -- trees. At the end these trees are then merged. -fromUnorderedToFile :: forall m e k r. - (MonadMask m, MonadIO m, - B.Binary (BLeaf k e), B.Binary k, B.Binary e, Ord k) +fromUnorderedToFile :: forall m k v r. + (MonadMask m, MonadIO m, B.Binary k, B.Binary v, Ord k) => FilePath -- ^ Path to scratch directory -> Int -- ^ Maximum chunk size -> Order -- ^ Order of tree -> FilePath -- ^ Output file - -> Producer (BLeaf k e) m r -- ^ 'Producer' of elements + -> (v -> v -> v) -- ^ merge functions + -> Producer (BLeaf k v) m r -- ^ 'Producer' of elements -> ExceptT String m () -fromUnorderedToFile scratch maxChunk order dest producer = {-# SCC fromUnorderedToFile #-} do - bList <- fromUnorderedToList scratch maxChunk producer - size <- BL.length bList - stream <- {-# SCC stream #-} BL.stream bList +fromUnorderedToFile scratch maxChunk order dest mergeV producer = {-# SCC fromUnorderedToFile #-} do + (bList,resource) <- fromUnorderedToList scratch maxChunk mergeV producer + size <- BL.length bList + stream <- {-# SCC stream #-} BL.stream bList lift $ {-# SCC buildTree #-} fromOrderedToFile order size dest stream - liftIO $ removeFile $ BL.filePath bList + liftIO $ cleanResource resource + {-# INLINE fromUnorderedToFile #-} -fromUnorderedToList :: forall m a r. - (MonadMask m, MonadIO m, B.Binary a, Ord a) +fromUnorderedToList :: forall m k v r. + (MonadMask m, MonadIO m, B.Binary k, B.Binary v, Ord k) => FilePath -- ^ Path to scratch directory -> Int -- ^ Maximum chunk size - -> Producer a m r -- ^ 'Producer' of elements - -> ExceptT String m (BL.BinaryList a) -fromUnorderedToList scratch maxChunk producer = do - lift (execStateT (fillLists producer) []) >>= {-# SCC goMerge #-} goMerge + -> (v -> v -> v) -- ^ merge functions + -> Producer (BLeaf k v) m r -- ^ 'Producer' of elements + -> ExceptT String m (BL.BinaryList (BLeaf k v),Resource) +fromUnorderedToList scratch maxChunk mergeV producer = lift (execStateT (fillLists producer) []) >>= {-# SCC goMerge #-} goMerge where - fillLists :: Producer a m r -> StateT [BL.BinaryList a] m r + fillLists :: Producer (BLeaf k v) m r -> StateT [(BL.BinaryList (BLeaf k v),Resource)] m r fillLists prod = {-# SCC fillLists #-} do - fname <- liftIO $ tempFilePath scratch "chunk.list" - (leaves, rest) <- lift $ takeChunk maxChunk prod - (bList, ()) <- lift $ BL.toBinaryList fname $ each $ S.toAscList leaves - modify (bList:) + resource@(_,fhandle) <- liftIO $ openTempFile scratch "chunk.list" + (leaves, rest) <- lift $ takeChunk maxChunk mergeV prod + (bList, ()) <- lift $ BL.toBinaryList fhandle $ each $ [ BLeaf k v | (k,v) <- M.toAscList leaves] + modify ((bList,resource):) case rest of Left r -> return r Right nextProd -> fillLists nextProd - goMerge :: [BL.BinaryList a] -> ExceptT String m (BL.BinaryList a) + goMerge :: [(BL.BinaryList (BLeaf k v),Resource)] -> ExceptT String m (BL.BinaryList (BLeaf k v), Resource) goMerge [l] = return l goMerge ls = do ls'' <- forM (splitChunks maxChunkMerge ls) $ \ls'->do - fname <- liftIO $ tempFilePath scratch "merged.list" - list <- mergeLists fname ls' - liftIO $ mapM_ (removeFile . BL.filePath) ls' - return list + resource@(_, fhandle) <- liftIO $ openTempFile scratch "merged.list" + let (chunk,chunkResource) = unzip ls' + list <- mergeLists fhandle mergeV chunk + liftIO $ mapM_ cleanResource chunkResource + return (list,resource) goMerge ls'' + {-# INLINE fromUnorderedToList #-} -- | Split the list into chunks of bounded size and run each through a function @@ -99,13 +102,16 @@ splitChunks chunkSize = go throwLeft :: Monad m => m (Either String r) -> m r throwLeft action = action >>= either error return -mergeLists :: (B.Binary a, Ord a, MonadMask m, MonadIO m) - => FilePath - -> [BL.BinaryList a] - -> ExceptT String m (BL.BinaryList a) -mergeLists dest lists = do +mergeLists :: (B.Binary k, Ord k, B.Binary v, MonadMask m, MonadIO m) + => Handle + -> (v -> v -> v) + -> [BL.BinaryList (BLeaf k v)] + -> ExceptT String m (BL.BinaryList (BLeaf k v)) +mergeLists dest mergeV lists = do streams <- mapM BL.stream lists - let prod = interleave (map throwLeft streams) + + let prod = leafMerger mergeV $ interleave (map throwLeft streams) + (bList, ()) <- lift $ BL.toBinaryList dest prod return bList {-# INLINE mergeLists #-} @@ -113,17 +119,37 @@ mergeLists dest lists = do -- | Take the first 'n' elements and collect them in a 'Set'. Return -- a 'Producer' which will emit the remaining elements (or the return -- value). -takeChunk :: forall m a r. (Monad m, Ord a) +takeChunk :: forall m k v r. (Monad m, Ord k) => Int - -> Producer a m r - -> m (S.Set a, Either r (Producer a m r)) -takeChunk n = go n S.empty + -> (v -> v -> v) + -> Producer (BLeaf k v) m r + -> m (M.Map k v, Either r (Producer (BLeaf k v) m r)) +takeChunk n mergeV = go n M.empty where - go :: Int -> S.Set a -> Producer a m r -> m (S.Set a, Either r (Producer a m r)) + go :: Int -> M.Map k v -> Producer (BLeaf k v) m r -> m (M.Map k v, Either r (Producer (BLeaf k v) m r)) go 0 s prod = return (s, Right prod) go i s prod = do result <- next prod case result of Left r -> return (s, Left r) - Right (a, prod') -> go (i-1) (S.insert a s) prod' + Right (BLeaf k v, prod') -> go (i-1) (M.insertWith mergeV k v s) prod' {-# INLINE takeChunk #-} + + + + +-- | 'combine' with monadic side-effects in the combine operation. +leafMerger :: (Monad m, Eq k) + => (v -> v -> v) -- ^ combine operation + -> Producer (BLeaf k v) m r -> Producer (BLeaf k v) m r +leafMerger mergeV producer = lift (next producer) >>= either return (uncurry go) + where go a@(BLeaf k v) producer' = do + n <- lift $ next producer' + case n of + Left r -> yield a >> return r + Right (a'@(BLeaf k' v'), producer'') + | k == k' -> go (BLeaf k' $ mergeV v v') producer'' + | otherwise -> yield a >> go a' producer'' +{-# INLINABLE leafMerger #-} + + diff --git a/src/BTree/Lookup.hs b/src/BTree/Lookup.hs index 75cc94b..459b869 100644 --- a/src/BTree/Lookup.hs +++ b/src/BTree/Lookup.hs @@ -3,6 +3,7 @@ module BTree.Lookup ( LookupTree , fromByteString , lookup , size + , fetch ) where import Prelude hiding (lookup) @@ -15,6 +16,7 @@ import Data.Binary import System.IO.MMap import BTree.Types +-- TODO: move to BTree.BinaryFile fetch :: (Binary a) => LookupTree k e -> OnDisk a -> a fetch lt (OnDisk offset) = decode $ LBS.fromStrict $ BS.drop (fromIntegral offset) (lt^.ltData) @@ -53,6 +55,15 @@ lookup lt k = | V.null rest -> Nothing | otherwise -> go $ fetch lt $ snd $ V.last rest + + + + + + + + + -- | How many keys are in a 'LookupTree'. size :: LookupTree k e -> Size size = _btSize . _ltHeader diff --git a/src/BTree/Walk.hs b/src/BTree/Walk.hs index 29c44e5..9fcd7d2 100644 --- a/src/BTree/Walk.hs +++ b/src/BTree/Walk.hs @@ -3,6 +3,7 @@ module BTree.Walk ( walkLeaves , walkNodes , walkNodesWithOffset + , walkNodesWithOffsetAfter ) where import BTree.Types @@ -13,6 +14,9 @@ import qualified Pipes.Prelude as PP import Data.Binary import Data.Binary.Get (runGetOrFail) import Control.Lens +import BTree.Lookup (fetch) +import qualified Data.Vector as V +import Control.Monad -- If we only look at leaves keys will increase monotonically as we -- progress through the file. @@ -53,3 +57,27 @@ walkNodesWithOffset = go 0 . {-# SCC "buffer" #-}view ltData then return (LBS.fromStrict rest, Nothing) else go (offset+o) rest {-# INLINE walkNodesWithOffset #-} + + +walkNodesWithOffsetAfter :: (Ord k, Binary k, Binary v, Monad m,MonadIO m, Show k) + => k + -> LookupTree k v + -> Producer (BLeaf k v) m () + +walkNodesWithOffsetAfter key tree = case _btRoot $ _ltHeader tree of + Nothing -> return () + Just root -> go $ fetch tree root + where + go = \case + Leaf x@(BLeaf k _) + | k >= key -> do yield x + + | otherwise -> do return () + + Node x xs -> do goList x (V.toList xs) + + goList x [] = go (fetch tree x) + + goList x1 ((k,x2):xs) + | k < key = goList x2 xs + | otherwise = go (fetch tree x1) >> goList x2 xs diff --git a/stack.yaml b/stack.yaml new file mode 100644 index 0000000..d6a1765 --- /dev/null +++ b/stack.yaml @@ -0,0 +1,8 @@ + + +resolver: lts-13.22 + +packages: + - ./ +extra-deps: +- pipes-interleave-1.1.3@sha256:8a526c6784dad7eb70d8873f13ba622724f986f074246d6e2db81a79bd9e59e6,965 \ No newline at end of file diff --git a/stack.yaml.lock b/stack.yaml.lock new file mode 100644 index 0000000..025bec9 --- /dev/null +++ b/stack.yaml.lock @@ -0,0 +1,19 @@ +# This file was autogenerated by Stack. +# You should not edit this file by hand. +# For more information, please see the documentation at: +# https://docs.haskellstack.org/en/stable/lock_files + +packages: +- completed: + hackage: pipes-interleave-1.1.3@sha256:8a526c6784dad7eb70d8873f13ba622724f986f074246d6e2db81a79bd9e59e6,965 + pantry-tree: + size: 219 + sha256: 88b833bddcc03d8b036a118d95afd96ad19485530a5ff1d239eeb9d6b6f34bd3 + original: + hackage: pipes-interleave-1.1.3@sha256:8a526c6784dad7eb70d8873f13ba622724f986f074246d6e2db81a79bd9e59e6,965 +snapshots: +- completed: + size: 498186 + url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/13/22.yaml + sha256: d4f07dc3d5658260c2fe34266ad7618f6c84d34decf559c9c786ac1cfccf4e7b + original: lts-13.22