Class muda::ParallelFor
ClassList > muda > ParallelFor
A frequently used parallel-for loop. DynamicBlockDim and GridStrideLoop strategies are provided and can be switched seamlessly; see the sketch below.
#include <parallel_for.h>
Inherits the following classes: muda::LaunchBase
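The strategy is selected by the constructor; the apply call is identical for all three forms, so switching strategies only means changing the constructor arguments. A minimal sketch (count and f stand in for a real range and callable):

ParallelFor()       .apply(count, f); // block size chosen automatically for max occupancy
ParallelFor(256)    .apply(count, f); // manual block size, grid size computed to cover count
ParallelFor(8, 256) .apply(count, f); // grid-stride loop: manual grid and block size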
Public Types
Type | Name |
---|---|
typedef KernelNodeParms< details::ParallelForCallable< raw_type_t< F > > > | NodeParms |
Public Types inherited from muda::LaunchBase
See muda::LaunchBase
Type | Name |
---|---|
typedef T | derived_type |
Public Functions
Type | Name |
---|---|
MUDA_HOST | ParallelFor (size_t shared_mem_size=0, cudaStream_t stream=nullptr) Calculates the grid dim automatically to cover the range and automatically chooses the block size to achieve max occupancy. |
MUDA_HOST | ParallelFor (int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr) Calculates the grid dim automatically to cover the range; you must set the block size manually. |
MUDA_HOST | ParallelFor (int gridDim, int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr) Uses a grid-stride loop to cover the range; you must set both the grid size and block size manually. Grid-stride loop: if grid_dim * block_dim < count, each thread loops to process multiple indices. |
MUDA_HOST ParallelFor & | apply (int count, F && f) |
MUDA_HOST ParallelFor & | apply (int count, F && f, Tag< UserTag >) |
MUDA_HOST MUDA_NODISCARD auto | as_node_parms (int count, F && f) |
MUDA_HOST MUDA_NODISCARD auto | as_node_parms (int count, F && f, Tag< UserTag >) |
MUDA_GENERIC int | calculate_block_dim (int count) const |
MUDA_GENERIC int | calculate_grid_dim (int count) const |
MUDA_GENERIC void | check_input (int count) const |
MUDA_HOST void | invoke (int count, F && f) |
Public Functions inherited from muda::LaunchBase
See muda::LaunchBase
Type | Name |
---|---|
MUDA_GENERIC | LaunchBase (::cudaStream_t stream) |
T & | callback (const std::function< void(::cudaStream_t, ::cudaError)> & callback) |
T & | file_line (std::string_view file, int line) |
T & | kernel_name (std::string_view name) |
Next | next (Next n) |
Next | next (Args &&... args) |
T & | pop_range () |
T & | push_range (const std::string & name) |
T & | record (cudaEvent_t e, int flag=cudaEventRecordDefault) |
T & | record (ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars) |
T & | record (ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars) |
T & | wait (cudaEvent_t e, int flag=cudaEventWaitDefault) |
T & | wait (const ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars) |
T & | wait (const ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars) |
T & | wait () |
T & | when (cudaEvent_t e, int flag=cudaEventWaitDefault) |
| ~LaunchBase () |
Public Functions inherited from muda::LaunchCore
See muda::LaunchCore
Type | Name |
---|---|
MUDA_GENERIC | LaunchCore (::cudaStream_t stream) |
void | callback (const std::function< void(::cudaStream_t, ::cudaError)> & callback) |
void | init_stream (::cudaStream_t s) |
void | pop_range () |
void | push_range (const std::string & name) |
void | record (cudaEvent_t e, int flag=cudaEventRecordDefault) |
void | record (ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars) |
void | record (ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars) |
void | wait (cudaEvent_t e, int flag=cudaEventWaitDefault) |
void | wait (const ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars) |
void | wait (const ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars) |
void | wait () |
void | when (cudaEvent_t e, int flag=cudaEventWaitDefault) |
| ~LaunchCore () |
Public Static Functions
Type | Name |
---|---|
MUDA_GENERIC int | calculate_grid_dim (int count, int block_dim) |
static MUDA_GENERIC MUDA_NODISCARD int | round_up_blocks (int count, int block_dim) |
Public Static Functions inherited from muda::LaunchCore
See muda::LaunchCore
Type | Name |
---|---|
void | file_line (std::string_view file, int line) |
void | kernel_name (std::string_view name) |
void | wait_device () |
void | wait_event (cudaEvent_t event) |
void | wait_stream (::cudaStream_t stream) |
Protected Types inherited from muda::LaunchCore
See muda::LaunchCore
Type | Name |
---|---|
typedef std::shared_ptr< T > | S |
Protected Attributes inherited from muda::LaunchCore
See muda::LaunchCore
Type | Name |
---|---|
::cudaStream_t | m_stream |
Protected Functions inherited from muda::LaunchBase
See muda::LaunchBase
Type | Name |
---|---|
T & | pop_kernel_label () |
Protected Functions inherited from muda::LaunchCore
See muda::LaunchCore
Type | Name |
---|---|
MUDA_HOST void | pop_kernel_label () |
MUDA_GENERIC ::cudaStream_t | stream () const |
Public Types Documentation
typedef NodeParms
typedef KernelNodeParms< details::ParallelForCallable< raw_type_t< F > > > NodeParms;
Public Functions Documentation
function ParallelFor [1/3]
Calculates the grid dim automatically to cover the range and automatically chooses the block size to achieve max occupancy.
inline MUDA_HOST muda::ParallelFor::ParallelFor (
size_t shared_mem_size=0,
cudaStream_t stream=nullptr
)
DeviceBuffer<int> buffer(256);
ParallelFor()
.kernel_name("set_buffer") // optional
.apply(buffer.size(),
[
buffer = buffer.viewer().name("buffer") // name is optional
] __device__(int i) mutable
{
buffer(i) = 1;
});
function ParallelFor [2/3]
Calculates the grid dim automatically to cover the range; you must set the block size manually.
inline MUDA_HOST muda::ParallelFor::ParallelFor (
int blockDim,
size_t shared_mem_size=0,
cudaStream_t stream=nullptr
)
DeviceBuffer<int> buffer(256);
ParallelFor(64)
.kernel_name("set_buffer") // optional
.apply(buffer.size(),
[
buffer = buffer.viewer().name("buffer") // name is optional
] __device__(int i) mutable
{
buffer(i) = 1;
});
function ParallelFor [3/3]
Uses a grid-stride loop to cover the range; you must set both the grid size and block size manually. Grid-stride loop: if grid_dim * block_dim < count, each thread loops to process multiple indices.
inline MUDA_HOST muda::ParallelFor::ParallelFor (
int gridDim,
int blockDim,
size_t shared_mem_size=0,
cudaStream_t stream=nullptr
)
DeviceBuffer<int> buffer(256);
ParallelFor(2, 64)
.kernel_name("set_buffer") // optional
.apply(buffer.size(),
[
buffer = buffer.viewer().name("buffer") // name is optional
] __device__(int i) mutable
{
buffer(i) = 1;
});
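Conceptually, each thread in the grid-stride loop variant handles the indices i, i + stride, i + 2 * stride, ... with stride = grid_dim * block_dim. A sketch of the per-thread loop (illustrative, not muda's literal implementation):

for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count;
     i += blockDim.x * gridDim.x)
{
    f(i); // the user callable sees every index this thread owns
}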
function apply [1/2]
template<typename F, typename UserTag>
MUDA_HOST ParallelFor & muda::ParallelFor::apply (
int count,
F && f
)
function apply [2/2]
template<typename F, typename UserTag>
MUDA_HOST ParallelFor & muda::ParallelFor::apply (
int count,
F && f,
Tag < UserTag >
)
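The Tag<UserTag> overload gives the kernel instantiation a distinct type, which is useful for telling otherwise identical kernels apart (e.g. in profilers). A minimal sketch; MyTag is a hypothetical user-defined tag type:

struct MyTag {}; // hypothetical empty tag type

ParallelFor()
    .apply(buffer.size(),
           [buffer = buffer.viewer()] __device__(int i) mutable
           { buffer(i) = 1; },
           Tag<MyTag>{});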
function as_node_parms [1/2]
template<typename F, typename UserTag>
MUDA_HOST MUDA_NODISCARD auto muda::ParallelFor::as_node_parms (
int count,
F && f
)
function as_node_parms [2/2]
template<typename F, typename UserTag>
MUDA_HOST MUDA_NODISCARD auto muda::ParallelFor::as_node_parms (
int count,
F && f,
Tag < UserTag >
)
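as_node_parms packages the launch into a NodeParms object (see Public Types) instead of submitting it, presumably so the kernel can be added as a node when building a CUDA graph. A hedged sketch; the commented graph call is an assumption, not a verified muda API:

auto parms = ParallelFor().as_node_parms(
    buffer.size(),
    [buffer = buffer.viewer()] __device__(int i) mutable
    { buffer(i) = 1; });
// graph.add_kernel_node(parms); // assumed graph-building API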
function calculate_block_dim
template<typename F, typename UserTag>
MUDA_GENERIC int muda::ParallelFor::calculate_block_dim (
int count
) const
function calculate_grid_dim [1/2]
MUDA_GENERIC int muda::ParallelFor::calculate_grid_dim (
    int count
) const
function check_input
MUDA_GENERIC void muda::ParallelFor::check_input (
    int count
) const
function invoke
template<typename F, typename UserTag>
MUDA_HOST void muda::ParallelFor::invoke (
int count,
F && f
)
Public Static Functions Documentation
function calculate_grid_dim [2/2]
static MUDA_GENERIC int muda::ParallelFor::calculate_grid_dim (
    int count,
    int block_dim
)
function round_up_blocks
static inline MUDA_GENERIC MUDA_NODISCARD int muda::ParallelFor::round_up_blocks (
int count,
int block_dim
)
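Judging by its name and parameters, round_up_blocks presumably performs ceiling division: the number of block_dim-sized blocks needed to cover count indices.

// presumed behavior: (count + block_dim - 1) / block_dim
int grid_dim = ParallelFor::round_up_blocks(1000, 128); // presumably 8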
The documentation for this class was generated from the following file src/muda/launch/parallel_for.h