|
MUDA_HOST | ParallelFor (size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT |
| Calculate the grid dimension automatically to cover the range, and automatically choose the block size to achieve maximum occupancy.
|
|
MUDA_HOST | ParallelFor (int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT |
| Calculate the grid dimension automatically to cover the range; the block size must be set manually.
|
|
MUDA_HOST | ParallelFor (int gridDim, int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT |
| Use a Grid-Stride Loop to cover the range; both the grid size and the block size must be set manually. Grid-Stride Loop: if grid_dim * block_dim < count, every thread loops to process multiple indices.
|
|
template<typename F , typename UserTag = Default> |
MUDA_HOST ParallelFor & | apply (int count, F &&f) |
|
template<typename F , typename UserTag = Default> |
MUDA_HOST ParallelFor & | apply (int count, F &&f, Tag< UserTag >) |
|
template<typename F , typename UserTag = Default> |
MUDA_HOST MUDA_NODISCARD auto | as_node_parms (int count, F &&f) -> S< NodeParms< F > > |
|
template<typename F , typename UserTag = Default> |
MUDA_HOST MUDA_NODISCARD auto | as_node_parms (int count, F &&f, Tag< UserTag >) -> S< NodeParms< F > > |
|
template<typename F , typename UserTag > |
MUDA_HOST void | invoke (int count, F &&f) |
|
template<typename F , typename UserTag > |
MUDA_GENERIC int | calculate_block_dim (int count) const MUDA_NOEXCEPT |
|
MUDA_GENERIC int | calculate_grid_dim (int count) const MUDA_NOEXCEPT |
|
MUDA_GENERIC void | check_input (int count) const MUDA_NOEXCEPT |
|
template<typename F , typename UserTag > |
MUDA_INLINE MUDA_GENERIC int | calculate_block_dim (int count) const MUDA_NOEXCEPT |
|
MUDA_GENERIC | LaunchBase (::cudaStream_t stream) MUDA_NOEXCEPT |
|
MUDA_GENERIC | LaunchBase (cudaStream_t stream) MUDA_NOEXCEPT |
|
ParallelFor & | push_range (const std::string &name) |
|
ParallelFor & | pop_range () |
|
ParallelFor & | kernel_name (std::string_view name) |
|
ParallelFor & | file_line (std::string_view file, int line) |
|
ParallelFor & | record (cudaEvent_t e, int flag=cudaEventRecordDefault) |
|
ParallelFor & | record (ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars) |
|
ParallelFor & | record (ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
ParallelFor & | when (cudaEvent_t e, int flag=cudaEventWaitDefault) |
|
ParallelFor & | wait (cudaEvent_t e, int flag=cudaEventWaitDefault) |
|
ParallelFor & | wait (const ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars) |
|
ParallelFor & | wait (const ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
ParallelFor & | wait () |
|
ParallelFor & | callback (const std::function< void(::cudaStream_t, ::cudaError)> &callback) |
|
Next | next (Next n) |
|
Next | next (Args &&... args) |
|
MUDA_GENERIC | LaunchCore (::cudaStream_t stream) MUDA_NOEXCEPT |
|
void | init_stream (::cudaStream_t s) |
|
void | push_range (const std::string &name) |
|
void | pop_range () |
|
void | record (cudaEvent_t e, int flag=cudaEventRecordDefault) |
|
void | record (ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars) |
|
template<typename... ViewT> |
void | record (ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
void | when (cudaEvent_t e, int flag=cudaEventWaitDefault) |
|
void | wait (cudaEvent_t e, int flag=cudaEventWaitDefault) |
|
void | wait (const ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars) |
|
template<typename... ViewT> |
void | wait (const ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
void | wait () |
|
void | callback (const std::function< void(::cudaStream_t, ::cudaError)> &callback) |
|
template<typename... ViewT> |
MUDA_INLINE void | record (ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
template<typename... ViewT> |
MUDA_INLINE void | wait (const ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
A frequently used parallel-for loop. Two launch strategies are provided — DynamicBlockDim and GridStrideLoop — and they can be switched between seamlessly.