From 3764a73d27bac00e4d00ed879560a2e556a5a231 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 10 Apr 2024 19:36:53 +0300 Subject: [PATCH 1/4] Implement functions which use Applicatives generateA is used as primitive and all other functions are expressed in it terms. First version goes through intermediate list. This is simplest implementation possible and would serve as baseline for further optimizations --- vector/src/Data/Vector/Generic.hs | 38 +++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/vector/src/Data/Vector/Generic.hs b/vector/src/Data/Vector/Generic.hs index 35acd0fb..29b75ddc 100644 --- a/vector/src/Data/Vector/Generic.hs +++ b/vector/src/Data/Vector/Generic.hs @@ -145,6 +145,9 @@ module Data.Vector.Generic ( scanr, scanr', scanr1, scanr1', iscanr, iscanr', + -- * Applicative API + replicateA, generateA, traverse, itraverse, + -- * Conversions -- ** Lists @@ -197,7 +200,8 @@ import Data.Vector.Internal.Check import Control.Monad.ST ( ST, runST ) import Control.Monad.Primitive import Prelude - ( Eq, Ord, Num, Enum, Monoid, Monad, Read, Show, Bool, Ordering(..), Int, Maybe(..), Either, IO, ShowS, ReadS, String + ( Eq, Ord, Num, Enum, Monoid, Applicative, Monad, Read, Show, Bool, Ordering(..) + , Int, Maybe(..), Either, IO, ShowS, ReadS, String , compare, mempty, mappend, return, fmap, otherwise, id, flip, seq, error, undefined, uncurry, shows, fst, snd, min, max, not , (>>=), (+), (-), (*), (<), (==), (.), ($), (=<<), (>>), (<$>) ) @@ -210,7 +214,7 @@ import Data.Typeable ( Typeable, gcast1 ) import Data.Data ( Data, DataType, Constr, Fixity(Prefix), mkDataType, mkConstr, constrIndex, mkNoRepType ) -import qualified Data.Traversable as T (Traversable(mapM)) +import qualified Data.Traversable as T (Traversable(mapM,traverse)) -- Length information -- ------------------ @@ -2641,6 +2645,36 @@ clone v = v `seq` New.create ( unsafeCopy mv v return mv) +-- Applicatives +-- ------------ + +-- | Execute the applicative action the given number of times and store the +-- results in a vector. +replicateA :: (Vector v a, Applicative f) => Int -> f a -> f (v a) +{-# INLINE replicateA #-} +replicateA n f = generateA n (\_ -> f) + + +-- | Construct a vector of the given length by applying the monadic +-- action to each index. +generateA :: (Vector v a, Applicative f) => Int -> (Int -> f a) -> f (v a) +{-# INLINE generateA #-} +generateA n f = fromListN n <$> T.traverse f [0 .. n-1] + +-- | Apply the applicative action to all elements of the vector, yielding a +-- vector of results. +traverse :: (Vector v a, Vector v b, Applicative f) + => (a -> f b) -> v a -> f (v b) +{-# INLINE traverse #-} +traverse f v = generateA (length v) $ \i -> f (unsafeIndex v i) + +-- | Apply the applicative action to every element of a vector and its +-- index, yielding a vector of results. +itraverse :: (Vector v a, Vector v b, Applicative f) + => (Int -> a -> f b) -> v a -> f (v b) +{-# INLINE itraverse #-} +itraverse f v = generateA (length v) $ \i -> f i (unsafeIndex v i) + -- Comparisons -- ----------- From 56dbd80c5e650c34c0e62e75986e9572673d6bbb Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Thu, 31 Oct 2024 22:10:50 +0300 Subject: [PATCH 2/4] First version of benchmarks for applicative functions We establish implementation which goes through list as baseline and the we can try to optimize it. Note definition of foldlOf'. It's different from definition in lens<=5.3.3 but it's absolutely necessary to get good perfomance in folds --- vector-bench-papi/benchmarks/Main.hs | 12 +++ .../benchlib/Bench/Vector/Algo/Applicative.hs | 101 ++++++++++++++++++ vector/benchmarks/Main.hs | 13 +++ vector/vector.cabal | 1 + 4 files changed, 127 insertions(+) create mode 100644 vector/benchlib/Bench/Vector/Algo/Applicative.hs diff --git a/vector-bench-papi/benchmarks/Main.hs b/vector-bench-papi/benchmarks/Main.hs index 590a7573..284e557f 100644 --- a/vector-bench-papi/benchmarks/Main.hs +++ b/vector-bench-papi/benchmarks/Main.hs @@ -12,6 +12,8 @@ import Bench.Vector.Algo.Spectral (spectral) import Bench.Vector.Algo.Tridiag (tridiag) import Bench.Vector.Algo.FindIndexR (findIndexR, findIndexR_naive, findIndexR_manual) import Bench.Vector.Algo.NextPermutation (generatePermTests) +import Bench.Vector.Algo.Applicative ( generateState, generateStateUnfold, generateIO, generateIOPrim + , lensSum, lensMap, baselineSum, baselineMap) import Bench.Vector.TestData.ParenTree (parenTree) import Bench.Vector.TestData.Graph (randomGraph) @@ -68,4 +70,14 @@ main = do , bench "minimumOn" $ whnf (U.minimumOn (\x -> x*x*x)) as , bench "maximumOn" $ whnf (U.maximumOn (\x -> x*x*x)) as , bgroup "(next|prev)Permutation" $ map (\(name, act) -> bench name $ whnfIO act) permTests + , bgroup "Applicative" + [ bench "generateState" $ whnf generateState useSize + , bench "generateStateUnfold" $ whnf generateStateUnfold useSize + , bench "generateIO" $ whnfIO (generateIO useSize) + , bench "generateIOPrim" $ whnfIO (generateIOPrim useSize) + , bench "sum[lens]" $ whnf lensSum as + , bench "sum[base]" $ whnf baselineSum as + , bench "map[lens]" $ whnf lensMap as + , bench "map[base]" $ whnf baselineMap as + ] ] diff --git a/vector/benchlib/Bench/Vector/Algo/Applicative.hs b/vector/benchlib/Bench/Vector/Algo/Applicative.hs new file mode 100644 index 00000000..06f4cd74 --- /dev/null +++ b/vector/benchlib/Bench/Vector/Algo/Applicative.hs @@ -0,0 +1,101 @@ +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +-- | +-- This module provides benchmarks for functions which use API based +-- on applicative. We use @generateA@ based benchmark for state and IO +-- and also benchmark folds and mapping using lens since it's one of +-- important consumers of this API. +module Bench.Vector.Algo.Applicative + ( -- * Standard benchmarks + generateState + , generateStateUnfold + , generateIO + , generateIOPrim + -- * Lens benchmarks + , lensSum + , baselineSum + , lensMap + , baselineMap + ) where + +import Control.Applicative +import Data.Coerce +import Data.Functor.Identity +import Data.Int +import Data.Monoid +import Data.Word +import qualified Data.Vector.Generic as VG +import qualified Data.Vector.Generic.Mutable as MVG +import qualified Data.Vector.Unboxed as VU +import System.Random.Stateful +import System.Mem (getAllocationCounter) + +-- | Benchmark which is running in state monad. +generateState :: Int -> VU.Vector Word64 +generateState n + = runStateGen_ (mkStdGen 42) + $ \g -> VG.generateA n (\_ -> uniformM g) + +-- | Benchmark which is running in state monad. +generateStateUnfold :: Int -> VU.Vector Word64 +generateStateUnfold n = VU.unfoldrExactN n genWord64 (mkStdGen 42) + +-- | Benchmark for running @generateA@ in IO monad. +generateIO :: Int -> IO (VU.Vector Int64) +generateIO n = VG.generateA n (\_ -> getAllocationCounter) + +-- | Baseline for 'generateIO' it uses primitive operations +generateIOPrim :: Int -> IO (VU.Vector Int64) +generateIOPrim n = VG.unsafeFreeze =<< MVG.replicateM n getAllocationCounter + +-- | Sum using lens +lensSum :: VU.Vector Double -> Double +{-# NOINLINE lensSum #-} +lensSum = foldlOf' VG.traverse (+) 0 + +-- | Baseline for sum. +baselineSum :: VU.Vector Double -> Double +{-# NOINLINE baselineSum #-} +baselineSum = VU.sum + +-- | Mapping over vector elements using +lensMap :: VU.Vector Double -> VU.Vector Double +{-# NOINLINE lensMap #-} +lensMap = over VG.traverse (*2) + +-- | Baseline for map +baselineMap :: VU.Vector Double -> VU.Vector Double +{-# NOINLINE baselineMap #-} +baselineMap = VU.map (*2) + +---------------------------------------------------------------- +-- Bits and pieces of lens +-- +-- We don't want to depend on lens so we just copy relevant +-- parts. After all we don't need much +---------------------------------------------------------------- + +type ASetter s t a b = (a -> Identity b) -> s -> Identity t +type Getting r s a = (a -> Const r a) -> s -> Const r s + +foldlOf' :: Getting (Endo (Endo r)) s a -> (r -> a -> r) -> r -> s -> r +foldlOf' l f z0 = \xs -> + let f' x (Endo k) = Endo $ \z -> k $! f z x + in foldrOf l f' (Endo id) xs `appEndo` z0 +{-# INLINE foldlOf' #-} + +foldrOf :: Getting (Endo r) s a -> (a -> r -> r) -> r -> s -> r +foldrOf l f z = flip appEndo z . foldMapOf l (Endo #. f) +{-# INLINE foldrOf #-} + +foldMapOf :: Getting r s a -> (a -> r) -> s -> r +foldMapOf = coerce +{-# INLINE foldMapOf #-} + +( #. ) :: Coercible c b => (b -> c) -> (a -> b) -> (a -> c) +( #. ) _ = coerce (\x -> x :: b) :: forall a b. Coercible b a => a -> b +{-# INLINE (#.) #-} + +over :: ASetter s t a b -> (a -> b) -> s -> t +over = coerce +{-# INLINE over #-} diff --git a/vector/benchmarks/Main.hs b/vector/benchmarks/Main.hs index f8aad4ea..79adb3b9 100644 --- a/vector/benchmarks/Main.hs +++ b/vector/benchmarks/Main.hs @@ -1,6 +1,7 @@ {-# LANGUAGE BangPatterns #-} module Main where + import Bench.Vector.Algo.MutableSet (mutableSet) import Bench.Vector.Algo.ListRank (listRank) import Bench.Vector.Algo.Rootfix (rootfix) @@ -12,6 +13,8 @@ import Bench.Vector.Algo.Spectral (spectral) import Bench.Vector.Algo.Tridiag (tridiag) import Bench.Vector.Algo.FindIndexR (findIndexR, findIndexR_naive, findIndexR_manual) import Bench.Vector.Algo.NextPermutation (generatePermTests) +import Bench.Vector.Algo.Applicative ( generateState, generateStateUnfold, generateIO, generateIOPrim + , lensSum, lensMap, baselineSum, baselineMap) import Bench.Vector.TestData.ParenTree (parenTree) import Bench.Vector.TestData.Graph (randomGraph) @@ -69,4 +72,14 @@ main = do , bench "minimumOn" $ whnf (U.minimumOn (\x -> x*x*x)) as , bench "maximumOn" $ whnf (U.maximumOn (\x -> x*x*x)) as , bgroup "(next|prev)Permutation" $ map (\(name, act) -> bench name $ whnfIO act) permTests + , bgroup "Applicative" + [ bench "generateState" $ whnf generateState useSize + , bench "generateStateUnfold" $ whnf generateStateUnfold useSize + , bench "generateIO" $ whnfIO (generateIO useSize) + , bench "generateIOPrim" $ whnfIO (generateIOPrim useSize) + , bench "sum[lens]" $ whnf lensSum as + , bench "sum[base]" $ whnf baselineSum as + , bench "map[lens]" $ whnf lensMap as + , bench "map[base]" $ whnf baselineMap as + ] ] diff --git a/vector/vector.cabal b/vector/vector.cabal index f53c9798..df2ce382 100644 --- a/vector/vector.cabal +++ b/vector/vector.cabal @@ -287,6 +287,7 @@ library benchmarks-O2 Bench.Vector.Algo.Quickhull Bench.Vector.Algo.Spectral Bench.Vector.Algo.Tridiag + Bench.Vector.Algo.Applicative Bench.Vector.Algo.FindIndexR Bench.Vector.Algo.NextPermutation Bench.Vector.TestData.ParenTree From bb38bf5f7fd1b8a1d6b366f0ba8d3ba69229c2e4 Mon Sep 17 00:00:00 2001 From: Alexey Khudyakov Date: Wed, 17 Apr 2024 23:59:03 +0300 Subject: [PATCH 3/4] Implement STA optimization trick as an optimization Does wonders for traversals using Identity --- vector/src/Data/Vector/Generic.hs | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/vector/src/Data/Vector/Generic.hs b/vector/src/Data/Vector/Generic.hs index 29b75ddc..15d36873 100644 --- a/vector/src/Data/Vector/Generic.hs +++ b/vector/src/Data/Vector/Generic.hs @@ -200,11 +200,10 @@ import Data.Vector.Internal.Check import Control.Monad.ST ( ST, runST ) import Control.Monad.Primitive import Prelude - ( Eq, Ord, Num, Enum, Monoid, Applicative, Monad, Read, Show, Bool, Ordering(..) + ( Eq(..), Ord(..), Num, Enum, Monoid, Applicative(..), Monad, Read, Show, Bool, Ordering(..) , Int, Maybe(..), Either, IO, ShowS, ReadS, String , compare, mempty, mappend, return, fmap, otherwise, id, flip, seq, error, undefined, uncurry, shows, fst, snd, min, max, not - , (>>=), (+), (-), (*), (<), (==), (.), ($), (=<<), (>>), (<$>) ) - + , (>>=), (+), (-), (*), (.), ($), (=<<), (>>), (<$>)) import qualified Text.Read as Read import qualified Data.List.NonEmpty as NonEmpty @@ -2648,6 +2647,19 @@ clone v = v `seq` New.create ( -- Applicatives -- ------------ + + +newtype STA v a = STA { + _runSTA :: forall s. Mutable v s a -> ST s (v a) +} + +runSTA :: Vector v a => Int -> STA v a -> v a +runSTA !sz = \(STA fun) -> runST $ fun =<< M.unsafeNew sz +{-# INLINE runSTA #-} + + + + -- | Execute the applicative action the given number of times and store the -- results in a vector. replicateA :: (Vector v a, Applicative f) => Int -> f a -> f (v a) @@ -2659,7 +2671,13 @@ replicateA n f = generateA n (\_ -> f) -- action to each index. generateA :: (Vector v a, Applicative f) => Int -> (Int -> f a) -> f (v a) {-# INLINE generateA #-} -generateA n f = fromListN n <$> T.traverse f [0 .. n-1] +generateA 0 _ = pure empty +generateA n f = runSTA n <$> go 0 + where + go !i | i >= n = pure $ STA unsafeFreeze + | otherwise = (\a (STA m) -> STA $ \mv -> M.unsafeWrite mv i a >> m mv) + <$> f i + <*> go (i + 1) -- | Apply the applicative action to all elements of the vector, yielding a -- vector of results. From 6ad34582071bbc46859f396f3ed02b0c465f7bcc Mon Sep 17 00:00:00 2001 From: Aleksey Khudyakov Date: Tue, 28 Jan 2025 17:27:07 +0300 Subject: [PATCH 4/4] Update vector/src/Data/Vector/Generic.hs Co-authored-by: konsumlamm <44230978+konsumlamm@users.noreply.github.com> --- vector/src/Data/Vector/Generic.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vector/src/Data/Vector/Generic.hs b/vector/src/Data/Vector/Generic.hs index 15d36873..138d2684 100644 --- a/vector/src/Data/Vector/Generic.hs +++ b/vector/src/Data/Vector/Generic.hs @@ -2667,7 +2667,7 @@ replicateA :: (Vector v a, Applicative f) => Int -> f a -> f (v a) replicateA n f = generateA n (\_ -> f) --- | Construct a vector of the given length by applying the monadic +-- | Construct a vector of the given length by applying the applicative -- action to each index. generateA :: (Vector v a, Applicative f) => Int -> (Int -> f a) -> f (v a) {-# INLINE generateA #-}