MUDA
Loading...
Searching...
No Matches
kernel_fill.inl
#include <muda/type_traits/type_label.h>
#include <muda/launch/memory.h>
#include <muda/launch/parallel_for.h>
#include <muda/buffer/var_view.h>
#include <muda/buffer/buffer_view.h>
#include <muda/buffer/buffer_2d_view.h>
#include <muda/buffer/buffer_3d_view.h>

namespace muda::details::buffer
{
// fill 0D: write a single value into the element referenced by `dst`.
//
// For trivially-copy-assignable T the bytes of `val` can be copied directly,
// so a plain memory upload on `stream` is used. Otherwise a one-block,
// one-thread kernel runs T's copy assignment on the device.
template <typename T>
MUDA_INLINE MUDA_HOST void kernel_fill(cudaStream_t stream, VarView<T> dst, const T& val)
{
    // Workaround for an nvcc requirement: the extended __device__ lambda must
    // be defined unconditionally here, outside the `if constexpr` branches.
    auto kernel = [dst, val] __device__(int i) mutable { *dst.data() = val; };

    if constexpr(muda::is_trivially_copy_assignable_v<T>)
    {
        // NOTE(review): if upload() is asynchronous w.r.t. the host, &val must
        // remain valid until the copy completes — confirm Memory::upload
        // semantics for pageable host sources.
        Memory(stream).upload(dst.data(), &val, sizeof(T));
    }
    else
    {
        ParallelFor(1, 1, 0, stream).apply(1, kernel);
    }
}

// fill 1D: assign `val` to every element of a linear buffer view.
// Launch shape: <grid_dim, block_dim>, zero dynamic shared memory, enqueued
// on `stream`; iteration count is dst.size().
template <typename T>
MUDA_INLINE MUDA_HOST void kernel_fill(
    int grid_dim, int block_dim, cudaStream_t stream, BufferView<T> dst, const T& val)
{
    ParallelFor(grid_dim, block_dim, 0, stream)
        .apply(dst.size(),
               [dst, val] __device__(int i) mutable { *dst.data(i) = val; });
}

// fill 2D: assign `val` to every element of a 2D buffer view, addressed by a
// flat index in [0, dst.total_size()); dst.data(i) maps the flat index to the
// (possibly pitched) element location.
template <typename T>
MUDA_INLINE MUDA_HOST void kernel_fill(
    int grid_dim, int block_dim, cudaStream_t stream, Buffer2DView<T> dst, const T& val)
{
    ParallelFor(grid_dim, block_dim, 0, stream)
        .apply(dst.total_size(),
               [dst, val] __device__(int i) mutable { *dst.data(i) = val; });
}

// fill 3D: assign `val` to every element of a 3D buffer view, addressed by a
// flat index in [0, dst.total_size()).
template <typename T>
MUDA_INLINE MUDA_HOST void kernel_fill(
    int grid_dim, int block_dim, cudaStream_t stream, Buffer3DView<T> dst, const T& val)
{
    ParallelFor(grid_dim, block_dim, 0, stream)
        .apply(dst.total_size(),
               [dst, val] __device__(int i) mutable { *dst.data(i) = val; });
}
}  // namespace muda::details::buffer
A view interface for any array-like linear memory, which can be constructed from DeviceBuffer/DeviceVe...
A frequently used parallel-for loop; DynamicBlockDim and GridStrideLoop strategies are provided,...