12#include <muda/launch/launch_base.h>
13#include <muda/launch/kernel_tag.h>
29 : callable(std::forward<U>(callable)),
36 template <
typename F,
typename UserTag>
39 template <
typename F,
typename UserTag>
43enum class ParallelForType : uint32_t
52 MUDA_NODISCARD MUDA_DEVICE
int active_num_in_block()
const MUDA_NOEXCEPT;
53 MUDA_NODISCARD MUDA_DEVICE
bool is_final_block()
const MUDA_NOEXCEPT;
54 MUDA_NODISCARD MUDA_DEVICE ParallelForType parallel_for_type()
const MUDA_NOEXCEPT
59 MUDA_NODISCARD MUDA_DEVICE
int total_num()
const MUDA_NOEXCEPT
63 MUDA_NODISCARD MUDA_DEVICE
operator int()
const MUDA_NOEXCEPT
68 MUDA_NODISCARD MUDA_DEVICE
int i()
const MUDA_NOEXCEPT
73 MUDA_NODISCARD MUDA_DEVICE
int batch_i()
const MUDA_NOEXCEPT
78 MUDA_NODISCARD MUDA_DEVICE
int total_batch()
const MUDA_NOEXCEPT
84 template <
typename F,
typename UserTag>
85 friend MUDA_GLOBAL
void details::parallel_for_kernel(ParallelForCallable<F> f);
87 template <
typename F,
typename UserTag>
88 friend MUDA_GLOBAL
void details::grid_stride_loop_kernel(ParallelForCallable<F> f);
90 MUDA_DEVICE
ParallelForDetails(ParallelForType type,
int i,
int total_num) MUDA_NOEXCEPT
92 m_total_num(total_num),
97 ParallelForType m_type;
99 int m_total_batch = 1;
101 int m_active_num_in_block = 0;
105using details::grid_stride_loop_kernel;
106using details::parallel_for_kernel;
119 size_t m_shared_mem_size;
122 template <
typename F>
142 MUDA_HOST
ParallelFor(
size_t shared_mem_size = 0, cudaStream_t stream =
nullptr) MUDA_NOEXCEPT
146 m_shared_mem_size(shared_mem_size)
166 MUDA_HOST
ParallelFor(
int blockDim,
size_t shared_mem_size = 0, cudaStream_t stream =
nullptr) MUDA_NOEXCEPT
169 m_block_dim(blockDim),
170 m_shared_mem_size(shared_mem_size)
194 size_t shared_mem_size = 0,
195 cudaStream_t stream =
nullptr) MUDA_NOEXCEPT
198 m_block_dim(blockDim),
199 m_shared_mem_size(shared_mem_size)
203 template <
typename F,
typename UserTag = Default>
206 template <
typename F,
typename UserTag = Default>
210 template <
typename F,
typename UserTag = Default>
211 MUDA_HOST MUDA_NODISCARD
auto as_node_parms(
int count, F&& f) -> S<NodeParms<F>>;
213 template <
typename F,
typename UserTag = Default>
214 MUDA_HOST MUDA_NODISCARD
auto as_node_parms(
int count, F&& f,
Tag<UserTag>)
217 MUDA_GENERIC MUDA_NODISCARD
static int round_up_blocks(
int count,
int block_dim) MUDA_NOEXCEPT
219 return (count + block_dim - 1) / block_dim;
223 template <
typename F,
typename UserTag>
224 MUDA_HOST
void invoke(
int count, F&& f);
226 template <
typename F,
typename UserTag>
227 MUDA_GENERIC
int calculate_block_dim(
int count)
const MUDA_NOEXCEPT;
229 MUDA_GENERIC
int calculate_grid_dim(
int count)
const MUDA_NOEXCEPT;
231 static MUDA_GENERIC
int calculate_grid_dim(
int count,
int block_dim) MUDA_NOEXCEPT;
233 MUDA_GENERIC
void check_input(
int count)
const MUDA_NOEXCEPT;
237#include "details/parallel_for.inl"
Definition kernel_node.h:15
Definition launch_base.h:86
Definition parallel_for.h:50
a frequently used parallel for loop; DynamicBlockDim and GridStrideLoop strategies are provided,...
Definition parallel_for.h:116
MUDA_HOST ParallelFor(size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT
Calculate grid dim automatically to cover the range, automatically choose the block size to achieve ma...
Definition parallel_for.h:142
MUDA_HOST ParallelFor(int gridDim, int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT
Use Grid Stride Loop to cover the range; you need to manually set the grid size and block size....
Definition parallel_for.h:192
MUDA_HOST ParallelFor(int blockDim, size_t shared_mem_size=0, cudaStream_t stream=nullptr) MUDA_NOEXCEPT
Calculate grid dim automatically to cover the range, but you need to manually set the block size.
Definition parallel_for.h:166
Definition parallel_for.h:23
Definition kernel_tag.h:6