Skip to content

Class muda::ParallelFor

ClassList > muda > ParallelFor

A frequently used parallel-for loop. The DynamicBlockDim and GridStrideLoop strategies are provided, and can be switched seamlessly between each other.

  • #include <parallel_for.h>

Inherits the following classes: muda::LaunchBase

Public Types

Type Name
typedef KernelNodeParms< details::ParallelForCallable< raw_type_t< F > > > NodeParms

Public Types inherited from muda::LaunchBase

See muda::LaunchBase

Type Name
typedef T derived_type

Public Functions

Type Name
MUDA_HOST ParallelFor (size_t shared_mem_size=0, cudaStream_t stream=nullptr)
Calculate grid dim automatically to cover the range, automatically choose the block size to achieve max occupancy.
MUDA_HOST ParallelFor (int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr)
Calculate grid dim automatically to cover the range, but you need to manually set the block size.
MUDA_HOST ParallelFor (int gridDim, int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr)
Use Grid Stride Loop to cover the range; you need to manually set the grid size and block size. Grid Stride Loop: if grid_dim * block_dim < count, there will be a loop in every thread, to process multiple indices.
MUDA_HOST ParallelFor & apply (int count, F && f)
MUDA_HOST ParallelFor & apply (int count, F && f, Tag< UserTag >)
MUDA_HOST MUDA_NODISCARD auto as_node_parms (int count, F && f)
MUDA_HOST MUDA_NODISCARD auto as_node_parms (int count, F && f, Tag< UserTag >)
MUDA_GENERIC int calculate_block_dim (int count) const
MUDA_GENERIC int calculate_grid_dim (int count) const
MUDA_GENERIC void check_input (int count) const
MUDA_HOST void invoke (int count, F && f)

Public Functions inherited from muda::LaunchBase

See muda::LaunchBase

Type Name
MUDA_GENERIC LaunchBase (::cudaStream_t stream)
T & callback (const std::function< void(::cudaStream_t, ::cudaError)> & callback)
T & file_line (std::string_view file, int line)
T & kernel_name (std::string_view name)
Next next (Next n)
Next next (Args &&... args)
T & pop_range ()
T & push_range (const std::string & name)
T & record (cudaEvent_t e, int flag=cudaEventRecordDefault)
T & record (ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars)
T & record (ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars)
T & wait (cudaEvent_t e, int flag=cudaEventWaitDefault)
T & wait (const ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars)
T & wait (const ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars)
T & wait ()
T & when (cudaEvent_t e, int flag=cudaEventWaitDefault)
~LaunchBase ()

Public Functions inherited from muda::LaunchCore

See muda::LaunchCore

Type Name
MUDA_GENERIC LaunchCore (::cudaStream_t stream)
void callback (const std::function< void(::cudaStream_t, ::cudaError)> & callback)
void init_stream (::cudaStream_t s)
void pop_range ()
void push_range (const std::string & name)
void record (cudaEvent_t e, int flag=cudaEventRecordDefault)
void record (ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars)
void record (ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars)
void wait (cudaEvent_t e, int flag=cudaEventWaitDefault)
void wait (const ComputeGraphVar< cudaEvent_t > & e, const std::vector< ComputeGraphVarBase * > & vars)
void wait (const ComputeGraphVar< cudaEvent_t > & e, ComputeGraphVar< ViewT > &... vars)
void wait ()
void when (cudaEvent_t e, int flag=cudaEventWaitDefault)
~LaunchCore ()

Public Static Functions

Type Name
MUDA_GENERIC int calculate_grid_dim (int count, int block_dim)
MUDA_GENERIC static MUDA_NODISCARD int round_up_blocks (int count, int block_dim)

Public Static Functions inherited from muda::LaunchCore

See muda::LaunchCore

Type Name
void file_line (std::string_view file, int line)
void kernel_name (std::string_view name)
void wait_device ()
void wait_event (cudaEvent_t event)
void wait_stream (::cudaStream_t stream)

Protected Types inherited from muda::LaunchCore

See muda::LaunchCore

Type Name
typedef std::shared_ptr< T > S

Protected Attributes inherited from muda::LaunchCore

See muda::LaunchCore

Type Name
::cudaStream_t m_stream

Protected Functions inherited from muda::LaunchBase

See muda::LaunchBase

Type Name
T & pop_kernel_label ()

Protected Functions inherited from muda::LaunchCore

See muda::LaunchCore

Type Name
MUDA_HOST void pop_kernel_label ()
MUDA_GENERIC::cudaStream_t stream () const

Public Types Documentation

typedef NodeParms

using muda::ParallelFor::NodeParms =  KernelNodeParms<details::ParallelForCallable<raw_type_t<F> >>;

Public Functions Documentation

function ParallelFor [1/3]

Calculate grid dim automatically to cover the range, automatically choose the block size to achieve max occupancy.

inline MUDA_HOST muda::ParallelFor::ParallelFor (
    size_t shared_mem_size=0,
    cudaStream_t stream=nullptr
) 

DeviceBuffer<int> buffer(256);
ParallelFor()
    .kernel_name("set_buffer") // optional
    .apply(buffer.size(), 
        [
            buffer = buffer.viewer().name("buffer") // name is optional
        ] __device__(int i) mutable 
        {
            buffer(i) = 1;
        });

function ParallelFor [2/3]

Calculate grid dim automatically to cover the range, but you need to manually set the block size.

inline MUDA_HOST muda::ParallelFor::ParallelFor (
    int blockDim,
    size_t shared_mem_size=0,
    cudaStream_t stream=nullptr
) 

DeviceBuffer<int> buffer(256);
ParallelFor(64)
    .kernel_name("set_buffer") // optional
    .apply(buffer.size(), 
        [
            buffer = buffer.viewer().name("buffer") // name is optional
        ] __device__(int i) mutable 
        {
            buffer(i) = 1;
        });

function ParallelFor [3/3]

Use Grid Stride Loop to cover the range; you need to manually set the grid size and block size. Grid Stride Loop: if grid_dim * block_dim < count, there will be a loop in every thread, to process multiple indices.

inline MUDA_HOST muda::ParallelFor::ParallelFor (
    int gridDim,
    int blockDim,
    size_t shared_mem_size=0,
    cudaStream_t stream=nullptr
) 

DeviceBuffer<int> buffer(256);
ParallelFor(2, 64)
    .kernel_name("set_buffer") // optional
    .apply(buffer.size(), 
        [
            buffer = buffer.viewer().name("buffer") // name is optional
        ] __device__(int i) mutable 
        {
            buffer(i) = 1;
        });

function apply [1/2]

template<typename F, typename UserTag>
MUDA_HOST ParallelFor & muda::ParallelFor::apply (
    int count,
    F && f
) 

function apply [2/2]

template<typename F, typename UserTag>
MUDA_HOST ParallelFor & muda::ParallelFor::apply (
    int count,
    F && f,
    Tag < UserTag >
) 

function as_node_parms [1/2]

template<typename F, typename UserTag>
MUDA_HOST MUDA_NODISCARD auto muda::ParallelFor::as_node_parms (
    int count,
    F && f
) 

function as_node_parms [2/2]

template<typename F, typename UserTag>
MUDA_HOST MUDA_NODISCARD auto muda::ParallelFor::as_node_parms (
    int count,
    F && f,
    Tag < UserTag >
) 

function calculate_block_dim

template<typename F, typename UserTag>
MUDA_GENERIC int muda::ParallelFor::calculate_block_dim (
    int count
) const

function calculate_grid_dim [1/2]

MUDA_GENERIC int muda::ParallelFor::calculate_grid_dim (
    int count
) const

function check_input

MUDA_GENERIC void muda::ParallelFor::check_input (
    int count
) const

function invoke

template<typename F, typename UserTag>
MUDA_HOST void muda::ParallelFor::invoke (
    int count,
    F && f
) 

Public Static Functions Documentation

function calculate_grid_dim [2/2]

static MUDA_GENERIC int muda::ParallelFor::calculate_grid_dim (
    int count,
    int block_dim
) 

function round_up_blocks

static inline MUDA_GENERIC MUDA_NODISCARD int muda::ParallelFor::round_up_blocks (
    int count,
    int block_dim
) 


The documentation for this class was generated from the following file src/muda/launch/parallel_for.h