|
MUDA_HOST | ParallelFor (size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT |
| Calculate the grid dimension automatically to cover the range, and automatically choose the block size to achieve maximum occupancy.
|
|
MUDA_HOST | ParallelFor (int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT |
| Calculate the grid dimension automatically to cover the range; the block size must be set manually.
|
|
MUDA_HOST | ParallelFor (int gridDim, int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT |
| Use a Grid-Stride Loop to cover the range; both the grid size and the block size must be set manually. Grid-Stride Loop: if grid_dim * block_dim < count, every thread loops to process multiple indices.
|
|
template<typename F , typename UserTag = Default> |
MUDA_HOST ParallelFor & | apply (int count, F &&f) |
|
template<typename F , typename UserTag = Default> |
MUDA_HOST ParallelFor & | apply (int count, F &&f, Tag< UserTag >) |
|
template<typename F , typename UserTag = Default> |
MUDA_HOST MUDA_NODISCARD auto | as_node_parms (int count, F &&f) -> S< NodeParms< F > > |
|
template<typename F , typename UserTag = Default> |
MUDA_HOST MUDA_NODISCARD auto | as_node_parms (int count, F &&f, Tag< UserTag >) -> S< NodeParms< F > > |
|
template<typename F , typename UserTag > |
MUDA_HOST void | invoke (int count, F &&f) |
|
template<typename F , typename UserTag > |
MUDA_GENERIC int | calculate_block_dim (int count) const MUDA_NOEXCEPT |
|
MUDA_GENERIC int | calculate_grid_dim (int count) const MUDA_NOEXCEPT |
|
MUDA_GENERIC void | check_input (int count) const MUDA_NOEXCEPT |
|
template<typename F , typename UserTag > |
MUDA_INLINE MUDA_GENERIC int | calculate_block_dim (int count) const MUDA_NOEXCEPT |
|
MUDA_GENERIC | LaunchBase (::cudaStream_t stream) MUDA_NOEXCEPT |
|
MUDA_GENERIC | LaunchBase (cudaStream_t stream) MUDA_NOEXCEPT |
|
ParallelFor & | push_range (const std::string &name) |
|
ParallelFor & | pop_range () |
|
ParallelFor & | kernel_name (std::string_view name) |
|
ParallelFor & | file_line (std::string_view file, int line) |
|
ParallelFor & | record (cudaEvent_t e, int flag=cudaEventRecordDefault) |
|
ParallelFor & | record (ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars) |
|
ParallelFor & | record (ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
ParallelFor & | when (cudaEvent_t e, int flag=cudaEventWaitDefault) |
|
ParallelFor & | wait (cudaEvent_t e, int flag=cudaEventWaitDefault) |
|
ParallelFor & | wait (const ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars) |
|
ParallelFor & | wait (const ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
ParallelFor & | wait () |
|
ParallelFor & | callback (const std::function< void(::cudaStream_t, ::cudaError)> &callback) |
|
Next | next (Next n) |
|
Next | next (Args &&... args) |
|
MUDA_GENERIC | LaunchCore (::cudaStream_t stream) MUDA_NOEXCEPT |
|
void | init_stream (::cudaStream_t s) |
|
void | push_range (const std::string &name) |
|
void | pop_range () |
|
void | record (cudaEvent_t e, int flag=cudaEventRecordDefault) |
|
void | record (ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars) |
|
template<typename... ViewT> |
void | record (ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
void | when (cudaEvent_t e, int flag=cudaEventWaitDefault) |
|
void | wait (cudaEvent_t e, int flag=cudaEventWaitDefault) |
|
void | wait (const ComputeGraphVar< cudaEvent_t > &e, const std::vector< ComputeGraphVarBase * > &vars) |
|
template<typename... ViewT> |
void | wait (const ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
void | wait () |
|
void | callback (const std::function< void(::cudaStream_t, ::cudaError)> &callback) |
|
template<typename... ViewT> |
MUDA_INLINE void | record (ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
template<typename... ViewT> |
MUDA_INLINE void | wait (const ComputeGraphVar< cudaEvent_t > &e, ComputeGraphVar< ViewT > &... vars) |
|
A frequently used parallel-for loop. Two launch strategies are provided — DynamicBlockDim and GridStrideLoop — and they can be switched between seamlessly.