MUDA
Loading...
Searching...
No Matches
kernel_copy_construct.inl
1#include <muda/type_traits/type_label.h>
2#include <muda/launch/memory.h>
5#include <muda/buffer/buffer_2d_view.h>
6#include <muda/buffer/buffer_3d_view.h>
7#include <muda/buffer/buffer_info_accessor.h>
8
9namespace muda::details::buffer
10{
// copy construct 0D
/// Copy-constructs the single object referenced by `dst` from the object
/// referenced by `src`.
///
/// The work is submitted through `ParallelFor(1, 1, 0, stream)` with a count
/// of 1; the device lambda runs placement new on `dst.data()` with
/// `*src.data()` as the constructor argument. Both views are captured by
/// value. This overload always goes through the device lambda; it has no
/// trivially-copyable shortcut.
///
/// @param stream stream handed to `ParallelFor`
/// @param dst    view whose storage receives the newly constructed object
/// @param src    view of the object to copy from
template <typename T>
MUDA_INLINE MUDA_HOST void kernel_copy_construct(cudaStream_t stream,
                                                 VarView<T>   dst,
                                                 CVarView<T>  src)
{
    ParallelFor(1, 1, 0, stream)
        .apply(1,
               [dst, src] __device__(int unused_index) mutable
               {
                   // the index argument is irrelevant: there is only one object
                   auto*  slot  = dst.data();
                   auto&& value = *src.data();
                   new(slot) T(value);
               });
}
22
/// Element-wise copy construction for 1D buffer views, done with placement
/// new inside a device lambda.
///
/// `ParallelFor(grid_dim, block_dim, 0, stream)` is asked to apply the lambda
/// with `dst.size()` as the count. For the index it receives, the lambda
/// constructs a `T` at `dst.data(index)` from `*src.data(index)`. The views
/// are copied into the lambda's capture list.
///
/// @param grid_dim  first argument forwarded to `ParallelFor`
/// @param block_dim second argument forwarded to `ParallelFor`
/// @param stream    stream forwarded to `ParallelFor`
/// @param dst       destination view; its size determines the element count
/// @param src       source view, read at the same indices as `dst`
template <typename T>
MUDA_INLINE MUDA_HOST void kernel_copy_construct_non_trivial(int             grid_dim,
                                                             int             block_dim,
                                                             cudaStream_t    stream,
                                                             BufferView<T>&  dst,
                                                             CBufferView<T>& src)
{
    ParallelFor(grid_dim, block_dim, 0, stream)
        .apply(dst.size(),
               [dst, src] __device__(int idx) mutable
               {
                   auto*  slot  = dst.data(idx);
                   auto&& value = *src.data(idx);
                   new(slot) T(value);
               });
}
35
// copy construct 1D
/// Copy-constructs the elements of the 1D view `dst` from `src`.
///
/// The strategy is selected at compile time from
/// `muda::is_trivially_copy_constructible_v<T>`:
///  - not trivially copy constructible: forwards to
///    `kernel_copy_construct_non_trivial`, which runs placement new per element;
///  - trivially copy constructible: hands `dst.size() * sizeof(T)` bytes to
///    `Memory(stream).transfer`; `grid_dim` and `block_dim` are not used.
///
/// The amount of data is derived from `dst` only; `src` is not compared
/// against it here.
template <typename T>
MUDA_INLINE MUDA_HOST void kernel_copy_construct(int            grid_dim,
                                                 int            block_dim,
                                                 cudaStream_t   stream,
                                                 BufferView<T>  dst,
                                                 CBufferView<T> src)
{
    if constexpr(!muda::is_trivially_copy_constructible_v<T>)
    {
        // the copy constructor of T has to run for every element
        kernel_copy_construct_non_trivial(grid_dim, block_dim, stream, dst, src);
    }
    else
    {
        // a raw byte transfer is enough for trivially copy constructible T
        const auto byte_count = dst.size() * sizeof(T);
        Memory(stream).transfer(dst.data(), src.data(), byte_count);
    }
}
54
/// Element-wise copy construction for 2D buffer views, done with placement
/// new inside a device lambda.
///
/// `ParallelFor(grid_dim, block_dim, 0, stream)` is asked to apply the lambda
/// with `dst.total_size()` as the count. Both views are addressed with the
/// single index the lambda receives: a `T` is constructed at
/// `dst.data(index)` from `*src.data(index)`. The views are copied into the
/// lambda's capture list.
///
/// @param grid_dim  first argument forwarded to `ParallelFor`
/// @param block_dim second argument forwarded to `ParallelFor`
/// @param stream    stream forwarded to `ParallelFor`
/// @param dst       destination view; its total size determines the count
/// @param src       source view, read at the same indices as `dst`
template <typename T>
MUDA_INLINE MUDA_HOST void kernel_copy_construct_non_trivial(int               grid_dim,
                                                             int               block_dim,
                                                             cudaStream_t      stream,
                                                             Buffer2DView<T>&  dst,
                                                             CBuffer2DView<T>& src)
{
    ParallelFor(grid_dim, block_dim, 0, stream)
        .apply(dst.total_size(),
               [dst, src] __device__(int idx) mutable
               {
                   auto*  slot  = dst.data(idx);
                   auto&& value = *src.data(idx);
                   new(slot) T(value);
               });
}
67
// copy construct 2D
/// Copy-constructs the elements of the 2D view `dst` from the 2D view `src`.
///
/// The strategy is selected at compile time from
/// `muda::is_trivially_copy_constructible_v<T>`:
///  - trivially copy constructible: a `cudaMemcpy3DParms` is filled from the
///    pitched pointers and offsets of both views plus the extent of `dst`,
///    then handed to `Memory(stream).transfer`; `grid_dim` and `block_dim`
///    are not used on this path;
///  - otherwise: forwards to `kernel_copy_construct_non_trivial`, which runs
///    placement new per element.
///
/// The copied region is sized from `dst.extent()` only; `src` is not compared
/// against it here.
///
/// @param grid_dim  forwarded to the non-trivial path
/// @param block_dim forwarded to the non-trivial path
/// @param stream    stream used by either path
/// @param dst       destination view
/// @param src       source view
template <typename T>
MUDA_INLINE MUDA_HOST void kernel_copy_construct(int              grid_dim,
                                                 int              block_dim,
                                                 cudaStream_t     stream,
                                                 Buffer2DView<T>  dst,
                                                 CBuffer2DView<T> src)
{
    if constexpr(muda::is_trivially_copy_constructible_v<T>)
    {
        // trivially copy constructible, use cudaMemcpy
        cudaMemcpy3DParms parms = {};

        // BufferInfoAccessor is not a dependent name, so the call needs no
        // `template` disambiguator (one without a template argument list is
        // ill-formed and rejected by recent compilers).
        parms.srcPtr = details::buffer::BufferInfoAccessor::cuda_pitched_ptr(src);
        parms.srcPos = src.offset().template cuda_pos<T>();
        parms.dstPtr = details::buffer::BufferInfoAccessor::cuda_pitched_ptr(dst);
        parms.dstPos = dst.offset().template cuda_pos<T>();
        parms.extent = dst.extent().template cuda_extent<T>();

        // NOTE(review): parms.kind stays zero-initialized in this function;
        // presumably Memory::transfer sets the copy direction - confirm.
        Memory(stream).transfer(parms);
    }
    else
    {
        // non-trivially copy constructible, use placement new
        kernel_copy_construct_non_trivial(grid_dim, block_dim, stream, dst, src);
    }
}
96
/// Element-wise copy construction for 3D buffer views, done with placement
/// new inside a device lambda.
///
/// `ParallelFor(grid_dim, block_dim, 0, stream)` is asked to apply the lambda
/// with `dst.total_size()` as the count. Both views are addressed with the
/// single index the lambda receives: a `T` is constructed at
/// `dst.data(index)` from `*src.data(index)`. The views are copied into the
/// lambda's capture list.
///
/// @param grid_dim  first argument forwarded to `ParallelFor`
/// @param block_dim second argument forwarded to `ParallelFor`
/// @param stream    stream forwarded to `ParallelFor`
/// @param dst       destination view; its total size determines the count
/// @param src       source view, read at the same indices as `dst`
template <typename T>
MUDA_INLINE MUDA_HOST void kernel_copy_construct_non_trivial(int               grid_dim,
                                                             int               block_dim,
                                                             cudaStream_t      stream,
                                                             Buffer3DView<T>&  dst,
                                                             CBuffer3DView<T>& src)
{
    ParallelFor(grid_dim, block_dim, 0, stream)
        .apply(dst.total_size(),
               [dst, src] __device__(int idx) mutable
               {
                   auto*  slot  = dst.data(idx);
                   auto&& value = *src.data(idx);
                   new(slot) T(value);
               });
}
109
// copy construct 3D
/// Copy-constructs the elements of the 3D view `dst` from the 3D view `src`.
///
/// The strategy is selected at compile time from
/// `muda::is_trivially_copy_constructible_v<T>`:
///  - trivially copy constructible: a `cudaMemcpy3DParms` is filled from the
///    pitched pointers and offsets of both views plus the extent of `dst`,
///    then handed to `Memory(stream).transfer`; `grid_dim` and `block_dim`
///    are not used on this path;
///  - otherwise: forwards to `kernel_copy_construct_non_trivial`, which runs
///    placement new per element.
///
/// The copied region is sized from `dst.extent()` only; `src` is not compared
/// against it here.
///
/// @param grid_dim  forwarded to the non-trivial path
/// @param block_dim forwarded to the non-trivial path
/// @param stream    stream used by either path
/// @param dst       destination view
/// @param src       source view
template <typename T>
MUDA_INLINE MUDA_HOST void kernel_copy_construct(int              grid_dim,
                                                 int              block_dim,
                                                 cudaStream_t     stream,
                                                 Buffer3DView<T>  dst,
                                                 CBuffer3DView<T> src)
{
    if constexpr(muda::is_trivially_copy_constructible_v<T>)
    {
        // trivially copy constructible, use cudaMemcpy
        cudaMemcpy3DParms parms = {};

        // BufferInfoAccessor is not a dependent name, so the call needs no
        // `template` disambiguator (one without a template argument list is
        // ill-formed and rejected by recent compilers).
        parms.srcPtr = details::buffer::BufferInfoAccessor::cuda_pitched_ptr(src);
        parms.srcPos = src.offset().template cuda_pos<T>();
        parms.dstPtr = details::buffer::BufferInfoAccessor::cuda_pitched_ptr(dst);
        parms.dstPos = dst.offset().template cuda_pos<T>();
        parms.extent = dst.extent().template cuda_extent<T>();

        // NOTE(review): parms.kind stays zero-initialized in this function;
        // presumably Memory::transfer sets the copy direction - confirm.
        Memory(stream).transfer(parms);
    }
    else
    {
        // non-trivially copy constructible, use placement new
        kernel_copy_construct_non_trivial(grid_dim, block_dim, stream, dst, src);
    }
}
138} // namespace muda::details::buffer
A view interface for any array-like linear memory, which can be constructed from DeviceBuffer/DeviceVe...
A frequently used parallel for loop; DynamicBlockDim and GridStrideLoop strategies are provided,...