MUDA
muda::ParallelFor Class Reference

A frequently used parallel-for loop. Two launch strategies, DynamicBlockDim and GridStrideLoop, are provided and can be switched seamlessly. More...

#include <parallel_for.h>

Inheritance diagram for muda::ParallelFor:
muda::LaunchCore → muda::LaunchBase< ParallelFor > → muda::ParallelFor

Public Types

template<typename F >
using NodeParms = KernelNodeParms< details::ParallelForCallable< raw_type_t< F > > >
 
- Public Types inherited from muda::LaunchBase< ParallelFor >
using derived_type = ParallelFor
 

Public Member Functions

MUDA_HOST ParallelFor (size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT
 Calculates the grid dim automatically to cover the range and automatically chooses the block size to achieve max occupancy.
 
MUDA_HOST ParallelFor (int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT
 Calculates the grid dim automatically to cover the range; you must manually set the block size.
 
MUDA_HOST ParallelFor (int gridDim, int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT
 Uses a Grid Stride Loop to cover the range; you must manually set both the grid size and the block size. Grid Stride Loop: if grid_dim * block_dim < count, every thread loops to process multiple indices.
 
template<typename F , typename UserTag = Default>
MUDA_HOST ParallelFor & apply (int count, F &&f)
 
template<typename F , typename UserTag = Default>
MUDA_HOST ParallelFor & apply (int count, F &&f, Tag< UserTag >)
 
template<typename F , typename UserTag = Default>
MUDA_HOST MUDA_NODISCARD auto as_node_parms (int count, F &&f) -> S< NodeParms< F > >
 
template<typename F , typename UserTag = Default>
MUDA_HOST MUDA_NODISCARD auto as_node_parms (int count, F &&f, Tag< UserTag >) -> S< NodeParms< F > >
 
template<typename F , typename UserTag >
MUDA_HOST void invoke (int count, F &&f)
 
template<typename F , typename UserTag >
MUDA_GENERIC int calculate_block_dim (int count) const MUDA_NOEXCEPT
 
MUDA_GENERIC int calculate_grid_dim (int count) const MUDA_NOEXCEPT
 
MUDA_GENERIC void check_input (int count) const MUDA_NOEXCEPT
 
- Public Member Functions inherited from muda::LaunchBase< ParallelFor >
MUDA_GENERIC LaunchBase (::cudaStream_t stream) MUDA_NOEXCEPT
 
ParallelFor & push_range (const std::string &name)
 
ParallelFor & pop_range ()
 
ParallelFor & kernel_name (std::string_view name)
 
ParallelFor & file_line (std::string_view file, int line)
 
ParallelFor & record (cudaEvent_t e, int flag=cudaEventRecordDefault)
 
ParallelFor & record (ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars)
 
ParallelFor & record (ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars)
 
ParallelFor & when (cudaEvent_t e, int flag=cudaEventWaitDefault)
 
ParallelFor & wait (cudaEvent_t e, int flag=cudaEventWaitDefault)
 
ParallelFor & wait (const ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars)
 
ParallelFor & wait (const ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars)
 
ParallelFor & wait ()
 
ParallelFor & callback (const std::function< void(::cudaStream_t, ::cudaError)> &callback)
 
Next next (Next n)
 
Next next (Args &&... args)
 
- Public Member Functions inherited from muda::LaunchCore
MUDA_GENERIC LaunchCore (::cudaStream_t stream) MUDA_NOEXCEPT
 
void init_stream (::cudaStream_t s)
 
void push_range (const std::string &name)
 
void pop_range ()
 
void record (cudaEvent_t e, int flag=cudaEventRecordDefault)
 
void record (ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars)
 
template<typename... ViewT>
void record (ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars)
 
void when (cudaEvent_t e, int flag=cudaEventWaitDefault)
 
void wait (cudaEvent_t e, int flag=cudaEventWaitDefault)
 
void wait (const ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars)
 
template<typename... ViewT>
void wait (const ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars)
 
void wait ()
 
void callback (const std::function< void(::cudaStream_t, ::cudaError)> &callback)
 

Static Public Member Functions

MUDA_GENERIC static MUDA_NODISCARD int round_up_blocks (int count, int block_dim) MUDA_NOEXCEPT
 
static MUDA_GENERIC int calculate_grid_dim (int count, int block_dim) MUDA_NOEXCEPT
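Judging by its name and signature, round_up_blocks presumably performs the standard ceiling division so that grid_dim * block_dim covers count; a minimal sketch of that assumed formula (not the verified implementation):

// Assumed behavior: smallest grid_dim such that grid_dim * block_dim >= count.
inline int round_up_blocks(int count, int block_dim)
{
    return (count + block_dim - 1) / block_dim;
}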
 
- Static Public Member Functions inherited from muda::LaunchCore
static void kernel_name (std::string_view name)
 
static void file_line (std::string_view file, int line)
 
static void wait_event (cudaEvent_t event)
 
static void wait_stream (::cudaStream_t stream)
 
static void wait_device ()
 

Additional Inherited Members

- Protected Types inherited from muda::LaunchCore
template<typename T >
using S = std::shared_ptr< T >
 
- Protected Member Functions inherited from muda::LaunchBase< ParallelFor >
ParallelFor & pop_kernel_label ()
 
- Protected Member Functions inherited from muda::LaunchCore
MUDA_GENERIC ::cudaStream_t stream () const
 
MUDA_HOST void pop_kernel_label ()
 
- Protected Attributes inherited from muda::LaunchCore
::cudaStream_t m_stream
 

Detailed Description

A frequently used parallel-for loop. Two launch strategies, DynamicBlockDim and GridStrideLoop, are provided and can be switched seamlessly.
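The launch strategy is selected by the constructor. A minimal sketch of the three forms (the grid/block sizes here are illustrative values, not library defaults):

// Dynamic block dim: block size chosen automatically for max occupancy.
ParallelFor pf_auto;
// Fixed block size (256); the grid size is computed to cover the range.
ParallelFor pf_block(256);
// Grid stride loop: both grid (8) and block (256) sizes fixed by the caller.
ParallelFor pf_stride(8, 256);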

Constructor & Destructor Documentation

◆ ParallelFor() [1/3]

MUDA_HOST muda::ParallelFor::ParallelFor (size_t shared_mem_size = 0, cudaStream_t stream = nullptr) [inline]

Calculates the grid dim automatically to cover the range and automatically chooses the block size to achieve max occupancy.

DeviceBuffer<int> buffer(256);
ParallelFor()
    .kernel_name("set_buffer") // optional
    .apply(buffer.size(),
           [
               buffer = buffer.viewer().name("buffer") // name is optional
           ] __device__(int i) mutable
           {
               buffer(i) = 1;
           });

◆ ParallelFor() [2/3]

MUDA_HOST muda::ParallelFor::ParallelFor (int blockDim, size_t shared_mem_size = 0, cudaStream_t stream = nullptr) [inline]

Calculates the grid dim automatically to cover the range; you must manually set the block size.

DeviceBuffer<int> buffer(256);
ParallelFor(256) // blockDim (illustrative); grid dim computed automatically
    .kernel_name("set_buffer") // optional
    .apply(buffer.size(),
           [
               buffer = buffer.viewer().name("buffer") // name is optional
           ] __device__(int i) mutable
           {
               buffer(i) = 1;
           });

◆ ParallelFor() [3/3]

MUDA_HOST muda::ParallelFor::ParallelFor (int gridDim, int blockDim, size_t shared_mem_size = 0, cudaStream_t stream = nullptr) [inline]

Uses a Grid Stride Loop to cover the range; you must manually set both the grid size and the block size. Grid Stride Loop: if grid_dim * block_dim < count, every thread loops to process multiple indices.

DeviceBuffer<int> buffer(256);
ParallelFor(2, 64) // gridDim, blockDim (illustrative): 2 * 64 = 128 < 256,
                   // so each thread loops to process two indices
    .kernel_name("set_buffer") // optional
    .apply(buffer.size(),
           [
               buffer = buffer.viewer().name("buffer") // name is optional
           ] __device__(int i) mutable
           {
               buffer(i) = 1;
           });
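Conceptually, each thread in a grid-stride launch runs a loop like the following (an illustrative sketch of the pattern, not muda's internal kernel code):

__global__ void grid_stride_set(int count, int* data)
{
    // Start at this thread's global index and stride by the total number
    // of threads in the grid; when grid_dim * block_dim < count, each
    // thread processes multiple indices.
    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < count;
        i += blockDim.x * gridDim.x)
    {
        data[i] = 1;
    }
}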

The documentation for this class was generated from the following file:
parallel_for.h