distributed-dataset-0.0.1.0: A distributed data processing framework in pure Haskell
Safe HaskellNone
LanguageHaskell2010

Control.Distributed.Dataset

Synopsis

Documentation

data Dataset a Source #

Represents a partitioned multiset that can be transformed in a distributed fashion.

Transformations

dMap :: (StaticSerialise a, StaticSerialise b) => Closure (a -> b) -> Dataset a -> Dataset b Source #

Returns a new Dataset that contains the result of applying the given function to each element.

dFilter :: StaticSerialise a => Closure (a -> Bool) -> Dataset a -> Dataset a Source #

Returns a new Dataset that only contains elements where the given function returns true.

dConcatMap :: (StaticSerialise a, StaticSerialise b) => Closure (a -> [b]) -> Dataset a -> Dataset b Source #

Returns a new Dataset by first applying a function to all elements of this Dataset, and then flattening the results.

dGroupedAggr Source #

Arguments

:: (StaticHashable k, StaticSerialise k, StaticSerialise b) 
=> Int

Target number of partitions

-> Closure (a -> k)

Grouping key

-> Aggr a b 
-> Dataset a 
-> Dataset (k, b) 

Apply an aggregation to all rows sharing the same key.

dDistinct :: StaticHashable a => Int -> Dataset a -> Dataset a Source #

Removes a new dataset with duplicate rows removed.

dDistinctBy Source #

Arguments

:: StaticHashable b 
=> Int

Target number of partitions

-> Closure (a -> b) 
-> Dataset a 
-> Dataset a 

Removes a new dataset with rows with the duplicate keys removed.

Low-level transformations

dCoalesce :: Typeable a => Int -> Dataset a -> Dataset a Source #

Coalesce partitions together to get the specified number of partitions.

dPipe :: (StaticSerialise a, StaticSerialise b) => Closure (ConduitT a b (ResourceT IO) ()) -> Dataset a -> Dataset b Source #

Transforms a Dataset by passing every partition through the given Conduit.

dPartition :: (StaticSerialise a, StaticHashable k) => Int -> Closure (a -> k) -> Dataset a -> Dataset a Source #

Re-partition the dataset using the given function so that the items with the same k will end up in the same partition.

Aggregations

Execution

dAggr :: (StaticSerialise a, StaticSerialise b) => Aggr a b -> Dataset a -> DD b Source #

Apply an aggregation to all items on a Dataset, and fetch the result.

dFetch :: StaticSerialise a => Dataset a -> DD (ConduitT () a (ResourceT IO) ()) Source #

Returns a Conduit to fetch the results lazily to the driver.

dToList :: StaticSerialise a => Dataset a -> DD [a] Source #

Fetches the complete dataset as a list.

data DD a Source #

Instances

Instances details
Monad DD Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

(>>=) :: DD a -> (a -> DD b) -> DD b Source #

(>>) :: DD a -> DD b -> DD b Source #

return :: a -> DD a Source #

Functor DD Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

fmap :: (a -> b) -> DD a -> DD b Source #

(<$) :: a -> DD b -> DD a Source #

Applicative DD Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

pure :: a -> DD a Source #

(<*>) :: DD (a -> b) -> DD a -> DD b Source #

liftA2 :: (a -> b -> c) -> DD a -> DD b -> DD c Source #

(*>) :: DD a -> DD b -> DD b Source #

(<*) :: DD a -> DD b -> DD a Source #

MonadIO DD Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

liftIO :: IO a -> DD a Source #

MonadUnliftIO DD Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

askUnliftIO :: DD (UnliftIO DD) Source #

withRunInIO :: ((forall a. DD a -> IO a) -> IO b) -> DD b Source #

MonadThrow DD Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

throwM :: Exception e => e -> DD a Source #

MonadCatch DD Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

catch :: Exception e => DD a -> (e -> DD a) -> DD a Source #

MonadLogger DD Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

monadLoggerLog :: ToLogStr msg => Loc -> LogSource -> LogLevel -> msg -> DD () Source #

data Backend Source #

Backend is responsible for running your functions in a remote environment.

Should run the current binary in the target environment, put the given string as standard input and return the executables answer on the standard output. | BackendM is essentially IO, but also has the ability to report the status of the executor.

data ShuffleStore Source #

Provides a way to store intermediate temporary data.

Creating datasets

data Partition a Source #

Represents some amount of data which to be transformed on a single executor.

mkPartition :: Typeable a => Closure (ConduitT () a (ResourceT IO) ()) -> Partition a Source #

Create a Partition given a Source conduit.

dExternal :: StaticSerialise a => [Partition a] -> Dataset a Source #

Create a dataset from given Partition's.

Class

class (Typeable a, Serialise a) => StaticSerialise a where Source #

Instances

Instances details
StaticSerialise Bool Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise Char Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise Double Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise Float Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise Int Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise Integer Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise () Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise Text Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise a => StaticSerialise [a] Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise a => StaticSerialise (Maybe a) Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise a => StaticSerialise (Min a) Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise a => StaticSerialise (Max a) Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise a => StaticSerialise (Sum a) Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticSerialise a => StaticSerialise (Down a) Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

(StaticHashable a, StaticSerialise a) => StaticSerialise (HashSet a) Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

(StaticSerialise a, StaticSerialise b) => StaticSerialise (a, b) Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

staticSerialise :: Closure (Dict (Typeable (a, b), Serialise (a, b))) Source #

(StaticSerialise a, StaticSerialise b, StaticSerialise c) => StaticSerialise (a, b, c) Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

staticSerialise :: Closure (Dict (Typeable (a, b, c), Serialise (a, b, c))) Source #

class (Typeable a, Eq a, Hashable a) => StaticHashable a where Source #

Instances

Instances details
StaticHashable Bool Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticHashable Char Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticHashable Double Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticHashable Float Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticHashable Int Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticHashable Integer Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticHashable () Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticHashable Text Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

StaticHashable a => StaticHashable [a] Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

staticHashable :: Closure (Dict (Typeable [a], Eq [a], Hashable [a])) Source #

(StaticHashable a, StaticHashable b) => StaticHashable (a, b) Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

staticHashable :: Closure (Dict (Typeable (a, b), Eq (a, b), Hashable (a, b))) Source #

(StaticHashable a, StaticHashable b, StaticHashable c) => StaticHashable (a, b, c) Source # 
Instance details

Defined in Control.Distributed.Dataset.Internal.Class

Methods

staticHashable :: Closure (Dict (Typeable (a, b, c), Eq (a, b, c), Hashable (a, b, c))) Source #

Re-exports

initDistributedFork :: IO () Source #

On distributed-fork, we run the same binary both in your machine (called "driver") and in the remote environment (called "executor"). In order for the program to act according to where it is, you should call this function as the first thing in your main:

liftIO :: MonadIO m => IO a -> m a Source #

Lift a computation from the IO monad.

(&) :: a -> (a -> b) -> b infixl 1 Source #

& is a reverse application operator. This provides notational convenience. Its precedence is one higher than that of the forward application operator $, which allows & to be nested in $.

>>> 5 & (+1) & show
"6"

Since: base-4.8.0.0

Closure

data Closure a Source #

Type of serializable closures. Abstractly speaking, a closure is a code reference paired together with an environment. A serializable closure includes a shareable code reference (i.e. a StaticPtr). Closures can be serialized only if all expressions captured in the environment are serializable.

Instances

Instances details
IsStatic Closure 
Instance details

Defined in Control.Distributed.Closure.Internal

StaticApply Closure 
Instance details

Defined in Control.Applicative.Static

Methods

staticApply :: (Typeable a, Typeable b) => Closure (a -> b) -> Closure a -> Closure b Source #

StaticFunctor Closure 
Instance details

Defined in Data.Functor.Static

Methods

staticMap :: (Typeable a, Typeable b) => Closure (a -> b) -> Closure a -> Closure b Source #

Typeable a => Binary (Closure a) 
Instance details

Defined in Control.Distributed.Closure.Internal

Methods

put :: Closure a -> Put Source #

get :: Get (Closure a) Source #

putList :: [Closure a] -> Put Source #

cap :: Typeable a => Closure (a -> b) -> Closure a -> Closure b Source #

Closure application. Note that Closure is not a functor, let alone an applicative functor, even if it too has a meaningful notion of application.

cpure :: Closure (Dict (Serializable a)) -> a -> Closure a Source #

A closure can be created from any serializable value. cpure corresponds to Control.Applicative's pure, but restricted to lifting serializable values only.

data Dict a where Source #

Values of type Dict p capture a dictionary for a constraint of type p.

e.g.

Dict :: Dict (Eq Int)

captures a dictionary that proves we have an:

instance Eq 'Int

Pattern matching on the Dict constructor will bring this instance into scope.

Constructors

Dict :: forall a. a => Dict a 

Instances

Instances details
HasDict a (Dict a) 
Instance details

Defined in Data.Constraint

Methods

evidence :: Dict a -> Dict a Source #

a :=> (Read (Dict a)) 
Instance details

Defined in Data.Constraint

Methods

ins :: a :- Read (Dict a) Source #

a :=> (Monoid (Dict a)) 
Instance details

Defined in Data.Constraint

Methods

ins :: a :- Monoid (Dict a) Source #

a :=> (Enum (Dict a)) 
Instance details

Defined in Data.Constraint

Methods

ins :: a :- Enum (Dict a) Source #

a :=> (Bounded (Dict a)) 
Instance details

Defined in Data.Constraint

Methods

ins :: a :- Bounded (Dict a) Source #

() :=> (Eq (Dict a)) 
Instance details

Defined in Data.Constraint

Methods

ins :: () :- Eq (Dict a) Source #

() :=> (Ord (Dict a)) 
Instance details

Defined in Data.Constraint

Methods

ins :: () :- Ord (Dict a) Source #

() :=> (Show (Dict a)) 
Instance details

Defined in Data.Constraint

Methods

ins :: () :- Show (Dict a) Source #

() :=> (Semigroup (Dict a)) 
Instance details

Defined in Data.Constraint

Methods

ins :: () :- Semigroup (Dict a) Source #

a => Bounded (Dict a) 
Instance details

Defined in Data.Constraint

a => Enum (Dict a) 
Instance details

Defined in Data.Constraint

Methods

succ :: Dict a -> Dict a Source #

pred :: Dict a -> Dict a Source #

toEnum :: Int -> Dict a Source #

fromEnum :: Dict a -> Int Source #

enumFrom :: Dict a -> [Dict a] Source #

enumFromThen :: Dict a -> Dict a -> [Dict a] Source #

enumFromTo :: Dict a -> Dict a -> [Dict a] Source #

enumFromThenTo :: Dict a -> Dict a -> Dict a -> [Dict a] Source #

Eq (Dict a) 
Instance details

Defined in Data.Constraint

Methods

(==) :: Dict a -> Dict a -> Bool Source #

(/=) :: Dict a -> Dict a -> Bool Source #

(Typeable p, p) => Data (Dict p) 
Instance details

Defined in Data.Constraint

Methods

gfoldl :: (forall d b. Data d => c (d -> b) -> d -> c b) -> (forall g. g -> c g) -> Dict p -> c (Dict p) Source #

gunfold :: (forall b r. Data b => c (b -> r) -> c r) -> (forall r. r -> c r) -> Constr -> c (Dict p) Source #

toConstr :: Dict p -> Constr Source #

dataTypeOf :: Dict p -> DataType Source #

dataCast1 :: Typeable t => (forall d. Data d => c (t d)) -> Maybe (c (Dict p)) Source #

dataCast2 :: Typeable t => (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c (Dict p)) Source #

gmapT :: (forall b. Data b => b -> b) -> Dict p -> Dict p Source #

gmapQl :: (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Dict p -> r Source #

gmapQr :: forall r r'. (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Dict p -> r Source #

gmapQ :: (forall d. Data d => d -> u) -> Dict p -> [u] Source #

gmapQi :: Int -> (forall d. Data d => d -> u) -> Dict p -> u Source #

gmapM :: Monad m => (forall d. Data d => d -> m d) -> Dict p -> m (Dict p) Source #

gmapMp :: MonadPlus m => (forall d. Data d => d -> m d) -> Dict p -> m (Dict p) Source #

gmapMo :: MonadPlus m => (forall d. Data d => d -> m d) -> Dict p -> m (Dict p) Source #

Ord (Dict a) 
Instance details

Defined in Data.Constraint

Methods

compare :: Dict a -> Dict a -> Ordering Source #

(<) :: Dict a -> Dict a -> Bool Source #

(<=) :: Dict a -> Dict a -> Bool Source #

(>) :: Dict a -> Dict a -> Bool Source #

(>=) :: Dict a -> Dict a -> Bool Source #

max :: Dict a -> Dict a -> Dict a Source #

min :: Dict a -> Dict a -> Dict a Source #

a => Read (Dict a) 
Instance details

Defined in Data.Constraint

Show (Dict a) 
Instance details

Defined in Data.Constraint

Methods

showsPrec :: Int -> Dict a -> ShowS Source #

show :: Dict a -> String Source #

showList :: [Dict a] -> ShowS Source #

Semigroup (Dict a) 
Instance details

Defined in Data.Constraint

Methods

(<>) :: Dict a -> Dict a -> Dict a Source #

sconcat :: NonEmpty (Dict a) -> Dict a Source #

stimes :: Integral b => b -> Dict a -> Dict a Source #

a => Monoid (Dict a) 
Instance details

Defined in Data.Constraint

Methods

mempty :: Dict a Source #

mappend :: Dict a -> Dict a -> Dict a Source #

mconcat :: [Dict a] -> Dict a Source #

NFData (Dict c) 
Instance details

Defined in Data.Constraint

Methods

rnf :: Dict c -> () Source #