Skip to content

Class muda::ParallelFor

ClassList > muda > ParallelFor

A frequently used parallel-for loop. The DynamicBlockDim and GridStrideLoop strategies are provided, and can be switched seamlessly between each other.

  • #include <parallel_for.h>

Inherits the following classes: muda::LaunchBase

Public Types

Type Name
typedef KernelNodeParms< details::ParallelForCallable< raw_type_t< F > > > NodeParms

Public Types inherited from muda::LaunchBase

See muda::LaunchBase

Type Name
typedef T derived_type

Public Functions

Type Name
MUDA_HOST ParallelFor (size_t shared_mem_size=0, cudaStream_t stream=nullptr)
Calculate grid dim automatically to cover the range, automatically choose the block size to achieve max occupancy.
MUDA_HOST ParallelFor (int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr)
Calculate grid dim automatically to cover the range, but you need to manually set the block size.
MUDA_HOST ParallelFor (int gridDim, int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr)
Use Grid Stride Loop to cover the range; you need to manually set the grid size and block size. Grid Stride Loop: if grid_dim * block_dim < count, there will be a loop in every thread, to process multiple indices.
MUDA_HOST ParallelFor & apply (int count, F && f)
MUDA_HOST ParallelFor & apply (int count, F && f, Tag< UserTag >)
MUDA_HOST MUDA_NODISCARD auto as_node_parms (int count, F && f)
MUDA_HOST MUDA_NODISCARD auto as_node_parms (int count, F && f, Tag< UserTag >)
MUDA_GENERIC int calculate_block_dim (int count) const
MUDA_GENERIC int calculate_grid_dim (int count) const
MUDA_GENERIC void check_input (int count) const
MUDA_HOST void invoke (int count, F && f)

Public Functions inherited from muda::LaunchBase

See muda::LaunchBase

Type Name
MUDA_GENERIC LaunchBase (::cudaStream_t stream)
T & callback (const std::function< void(::cudaStream_t, ::cudaError)> & callback)
T & file_line (std::string_view file, int line)
T & kernel_name (std::string_view name)
Next next (Next n)
Next next (Args &&... args)
T & pop_range ()
T & push_range (const std::string & name)
T & record (cudaEvent_t e, int flag=cudaEventRecordDefault)
T & record (ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars)
T & record (ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars)
T & wait (cudaEvent_t e, int flag=cudaEventWaitDefault)
T & wait (const ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars)
T & wait (const ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars)
T & wait ()
T & when (cudaEvent_t e, int flag=cudaEventWaitDefault)
~LaunchBase ()

Public Functions inherited from muda::LaunchCore

See muda::LaunchCore

Type Name
MUDA_GENERIC LaunchCore (::cudaStream_t stream)
void callback (const std::function< void(::cudaStream_t, ::cudaError)> & callback)
void init_stream (::cudaStream_t s)
void pop_range ()
void push_range (const std::string & name)
void record (cudaEvent_t e, int flag=cudaEventRecordDefault)
void record (ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars)
void record (ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars)
void wait (cudaEvent_t e, int flag=cudaEventWaitDefault)
void wait (const ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars)
void wait (const ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars)
void wait ()
void when (cudaEvent_t e, int flag=cudaEventWaitDefault)
~LaunchCore ()

Public Static Functions

Type Name
MUDA_GENERIC int calculate_grid_dim (int count, int block_dim)
MUDA_GENERIC static MUDA_NODISCARD int round_up_blocks (int count, int block_dim)

Public Static Functions inherited from muda::LaunchCore

See muda::LaunchCore

Type Name
void file_line (std::string_view file, int line)
void kernel_name (std::string_view name)
void wait_device ()
void wait_event (cudaEvent_t event)
void wait_stream (::cudaStream_t stream)

Protected Types inherited from muda::LaunchCore

See muda::LaunchCore

Type Name
typedef std::shared_ptr< T > S

Protected Attributes inherited from muda::LaunchCore

See muda::LaunchCore

Type Name
::cudaStream_t m_stream

Protected Functions inherited from muda::LaunchBase

See muda::LaunchBase

Type Name
T & pop_kernel_label ()

Protected Functions inherited from muda::LaunchCore

See muda::LaunchCore

Type Name
MUDA_HOST void pop_kernel_label ()
MUDA_GENERIC::cudaStream_t stream () const

Public Types Documentation

typedef NodeParms

using muda::ParallelFor::NodeParms =  KernelNodeParms<details::ParallelForCallable<raw_type_t<F> >>;

Public Functions Documentation

function ParallelFor [1/3]

Calculate grid dim automatically to cover the range, automatically choose the block size to achieve max occupancy.

inline MUDA_HOST muda::ParallelFor::ParallelFor (
    size_t shared_mem_size=0,
    cudaStream_t stream=nullptr
) 

DeviceBuffer<int> buffer(256);
ParallelFor()
    .kernel_name("set_buffer") // optional
    .apply(buffer.size(), 
        [
            buffer = buffer.viewer().name("buffer") // name is optional
        ] __device__(int i) mutable 
        {
            buffer(i) = 1;
        });

function ParallelFor [2/3]

Calculate grid dim automatically to cover the range, but you need to manually set the block size.

inline MUDA_HOST muda::ParallelFor::ParallelFor (
    int blockDim,
    size_t shared_mem_size=0,
    cudaStream_t stream=nullptr
) 

DeviceBuffer<int> buffer(256);
ParallelFor(64)
    .kernel_name("set_buffer") // optional
    .apply(buffer.size(), 
        [
            buffer = buffer.viewer().name("buffer") // name is optional
        ] __device__(int i) mutable 
        {
            buffer(i) = 1;
        });

function ParallelFor [3/3]

Use Grid Stride Loop to cover the range; you need to manually set the grid size and block size. Grid Stride Loop: if grid_dim * block_dim < count, there will be a loop in every thread, to process multiple indices.

inline MUDA_HOST muda::ParallelFor::ParallelFor (
    int gridDim,
    int blockDim,
    size_t shared_mem_size=0,
    cudaStream_t stream=nullptr
) 

DeviceBuffer<int> buffer(256);
ParallelFor(2, 64)
    .kernel_name("set_buffer") // optional
    .apply(buffer.size(), 
        [
            buffer = buffer.viewer().name("buffer") // name is optional
        ] __device__(int i) mutable 
        {
            buffer(i) = 1;
        });

function apply [1/2]

template<typename F, typename UserTag>
MUDA_HOST ParallelFor & muda::ParallelFor::apply (
    int count,
    F && f
) 

function apply [2/2]

template<typename F, typename UserTag>
MUDA_HOST ParallelFor & muda::ParallelFor::apply (
    int count,
    F && f,
    Tag < UserTag >
) 

function as_node_parms [1/2]

template<typename F, typename UserTag>
MUDA_HOST MUDA_NODISCARD auto muda::ParallelFor::as_node_parms (
    int count,
    F && f
) 

function as_node_parms [2/2]

template<typename F, typename UserTag>
MUDA_HOST MUDA_NODISCARD auto muda::ParallelFor::as_node_parms (
    int count,
    F && f,
    Tag < UserTag >
) 

function calculate_block_dim

template<typename F, typename UserTag>
MUDA_GENERIC int muda::ParallelFor::calculate_block_dim (
    int count
) const

function calculate_grid_dim [1/2]

MUDA_GENERIC int muda::ParallelFor::calculate_grid_dim (
    int count
) const

function check_input

MUDA_GENERIC void muda::ParallelFor::check_input (
    int count
) const

function invoke

template<typename F, typename UserTag>
MUDA_HOST void muda::ParallelFor::invoke (
    int count,
    F && f
) 

Public Static Functions Documentation

function calculate_grid_dim [2/2]

static MUDA_GENERIC int muda::ParallelFor::calculate_grid_dim (
    int count,
    int block_dim
) 

function round_up_blocks

static inline MUDA_GENERIC MUDA_NODISCARD int muda::ParallelFor::round_up_blocks (
    int count,
    int block_dim
) 


The documentation for this class was generated from the following file src/muda/launch/parallel_for.h