MUDA
Loading...
Searching...
No Matches
sparse_spatial_hash_impl.inl
1#include <muda/cub/device/device_reduce.h>
2#include <muda/cub/device/device_radix_sort.h>
3#include <muda/cub/device/device_run_length_encode.h>
4#include <muda/cub/device/device_scan.h>
5#include <muda/cub/device/device_select.h>
6
7namespace muda::spatial_hash::details
8{
/// Build the hash table for the spheres currently stored in `spheres`.
///
/// The three stages run in a fixed order; each stage returns early on its
/// own when `empty_level` has been set by the first one.
template <typename Hash>
inline void SparseSpatialHashImpl<Hash>::setup_hash_table()
{
    // stage 1: grid origin + cell size (also decides `empty_level`)
    calculate_hash_table_basic_info();

    // stage 2: one home cell + up to seven phantom cells per sphere
    fill_hash_cells();

    // stage 3: sort the cell entries by key and count objects per unique cell
    count_object_per_cell();
}
16
/// Run one broad-phase pass over `boundingSphereList`.
///
/// @param boundingSphereList  spheres to test; stored in the `spheres` member.
/// @param append              forwarded to balanced_setup_collision_pairs().
/// @param collisionPairs      output buffer receiving the detected pairs.
/// @param pred                user predicate, invoked as pred(oid0, oid1) on
///                            the device for candidate pairs.
template <typename Hash>
template <typename Pred>
inline void SparseSpatialHashImpl<Hash>::detect(CBufferView<BoundingSphere> boundingSphereList,
                                                bool append,
                                                DeviceBuffer<CollisionPair>& collisionPairs,
                                                Pred&& pred)
{
    // remember the input; every later stage reads it through the member
    this->spheres = boundingSphereList;

    this->setup_hash_table();

    this->balanced_setup_collision_pairs(append, collisionPairs, std::forward<Pred>(pred));
}
28
/// Derive the grid origin (`coord_min`) and the uniform `cell_size` of the
/// hash table from the spheres in `spheres`, then upload them to
/// `spatialHashConfig`.
///
/// Sets `empty_level` to true (and leaves the config untouched) when there
/// are no spheres at all, or when the largest radius found at `level` is 0.
template <typename Hash>
void SparseSpatialHashImpl<Hash>::calculate_hash_table_basic_info()
{
    auto count = spheres.size();

    // Empty input: a max-reduction over zero items yields the reduction's
    // identity (the lowest float), not 0, so the `== 0.0f` test further down
    // would not recognise it and a negative cell size would be configured.
    if(count == 0)
    {
        empty_level = true;
        return;
    }

    BufferLaunch(m_stream)
        .clear(allRadius)
        .resize(allRadius, count)
        .clear(allCoords)
        .resize(allCoords, count);

    constexpr auto float_max = std::numeric_limits<float>::max();
    const Vector3  vector3_max(float_max, float_max, float_max);

    ParallelFor(0, m_stream)  //
        .apply(count,
               [spheres   = spheres.cviewer().name("spheres"),
                allRadius = allRadius.viewer().name("allRadius"),
                allCoords = allCoords.viewer().name("allCoords"),
                vector3_max,
                level = this->level] __device__(int i) mutable
               {
                   const auto& s = spheres(i);
                   // only spheres exactly at this level take part in the max-radius search
                   allRadius(i) = s.level == level ? s.r : 0.0f;
                   // spheres above this level are neutralised for the min-coord search
                   allCoords(i) = s.level <= level ? s.o : vector3_max;
               });

    // Both reductions consume what the kernel above wrote on m_stream, so they
    // must be enqueued on the same stream to be ordered after it.
    DeviceReduce(m_stream).Max(allRadius.data(), maxRadius.data(), count);

    DeviceReduce(m_stream).Reduce(
        allCoords.data(),
        minCoord.data(),
        count,
        [] __host__ __device__(const Vector3& a, const Vector3& b) -> Vector3
        { return a.cwiseMin(b); },
        vector3_max);

    // Scaling the bounding sphere of each object by sqrt(2) [we will use proxy spheres]
    // and ensuring that the grid cell is at least 1.5 times
    // as large as the scaled bounding sphere of the largest object.
    // https://developer.nvidia.com/gpugems/gpugems3/part-v-physics-simulation/chapter-32-broad-phase-collision-detection-cuda

    // read the reduced maximum back to the host (named h_* to avoid shadowing the member)
    float h_maxRadius = this->maxRadius;
    empty_level       = h_maxRadius == 0.0f;

    if(empty_level)  // no object in this level
        return;

    // diameter (2r) * proxy scale (1.5) * safety factor (1.5)
    auto scaledCellSize           = h_maxRadius * 2 * 1.5 * 1.5;
    h_spatialHashConfig.coord_min = minCoord;
    // shift coord_min by one full scaled cell, which is much safer than shifting by maxRadius
    h_spatialHashConfig.coord_min -= scaledCellSize * Vector3::Ones();
    h_spatialHashConfig.cell_size = scaledCellSize;

    // upload
    spatialHashConfig = h_spatialHashConfig;
}
87
/// Emit the (cell, object) entries that are later sorted by cell key.
///
/// Every sphere owns 8 consecutive slots in `cellArrayValue` / `cellArrayKey`:
/// slot 0 is its home cell (the cell containing the sphere centre), followed
/// by one phantom entry for each of the 7 nearest neighbour cells that the
/// enlarged proxy sphere overlaps. Unused slots are filled with `Cell(-1, -1)`.
/// One extra trailing element with key -1 is appended after all spheres.
///
/// Does nothing when `empty_level` is set.
template <typename Hash>
void SparseSpatialHashImpl<Hash>::fill_hash_cells()
{
    if(empty_level)
        return;

    using namespace muda;

    int size  = spheres.size();
    int count = 8 * size + 1;  // 8 slots per sphere + 1 trailing sentinel

    // resize only; the buffers are not cleared here
    BufferLaunch(m_stream)
        .resize(cellArrayValue, count)
        .resize(cellArrayKey, count)
        .resize(cellArrayValueSorted, count)
        .resize(cellArrayKeySorted, count)
        .resize(uniqueKey, count)
        .resize(objCountInCell, count)
        .resize(objCountInCellPrefixSum, count)
        .resize(collisionPairCount, count)
        .resize(collisionPairPrefixSum, count);

    ParallelFor(0, m_stream)  //
        .apply(spheres.size(),
               [spheres           = spheres.cviewer(),
                spatialHashConfig = spatialHashConfig.viewer(),
                cellArrayValue = make_dense_2d(cellArrayValue.data(), size, 8),
                cellArrayKey = make_dense_2d(cellArrayKey.data(), size, 8)] __device__(int i) mutable
               {
                   BoundingSphere s  = spheres(i);
                   auto&          sh = *spatialHashConfig;

                   auto proxySphere = s;
                   // https://developer.nvidia.com/gpugems/gpugems3/part-v-physics-simulation/chapter-32-broad-phase-collision-detection-cuda
                   // Scaling the bounding sphere of each object by sqrt(2), here we take 1.5(>1.414).
                   // Float literal: keeps the multiply in single precision on the device.
                   proxySphere.r *= 1.5f;

                   auto     o        = s.o;
                   Vector3u ijk      = sh.cell(o);
                   auto     hash     = sh.hash_cell(ijk);
                   auto     cellSize = sh.cell_size;
                   auto     objectId = i;

                   Cell homeCell(hash, objectId);

                   // ...[i*8+0][i*8+1][i*8+2][i*8+3][i*8+4][i*8+5][i*8+6][i*8+7]...
                   // ...[hcell][pcell][pcell] ...   [  x  ][  x  ][  x  ][  x  ]...

                   // fill the cell that contains the center of the current sphere
                   homeCell.set_as_home(ijk);
                   Vector3 xyz = sh.cell_center_coord(ijk);

                   // find the closest 7 neighbor cells: per axis, step towards the
                   // side of the home cell on which the sphere centre lies
                   Vector3i dxyz;
#pragma unroll
                   for(int axis = 0; axis < 3; ++axis)
                       dxyz(axis) = o(axis) > xyz(axis) ? 1 : -1;

                   auto cal_cell = [&](const Vector3i& offset) -> Vector3u
                   {
                       Vector3i res = (ijk.cast<I32>() + offset);
                       return res.cast<U32>();
                   };

                   Vector3u cells[7] = {cal_cell(Vector3i(dxyz.x(), 0, 0)),
                                        cal_cell(Vector3i(0, dxyz.y(), 0)),
                                        cal_cell(Vector3i(0, 0, dxyz.z())),
                                        cal_cell(Vector3i(0, dxyz.y(), dxyz.z())),
                                        cal_cell(Vector3i(dxyz.x(), 0, dxyz.z())),
                                        cal_cell(Vector3i(dxyz.x(), dxyz.y(), 0)),
                                        cal_cell(dxyz)};

                   // the cell size (3d)
                   Vector3 cellExtent(cellSize, cellSize, cellSize);

                   int idx = 1;  // goes from 1 -> 7. idx = 0 is for the homeCell
#pragma unroll
                   for(int n = 0; n < 7; ++n)
                   {
                       Vector3 min = sh.coord(cells[n]);
                       Vector3 max = min + cellExtent;
                       AABB    aabb(min, max);

                       // use proxySphere to test
                       // whether the current sphere overlaps the neighbor cell
                       if(intersect(proxySphere, aabb))
                       {
                           homeCell.set_overlap(cells[n]);
                           auto neighborHash = sh.hash_cell(cells[n]);
                           Cell phantom(neighborHash, objectId);
                           phantom.set_as_phantom(ijk, cells[n]);
                           // snapshot of the overlap bits accumulated so far
                           phantom.ctlbit.overlap = homeCell.ctlbit.overlap;
                           cellArrayValue(objectId, idx++) = phantom;
                       }
                   }

                   // set the home cell (after the loop, so it carries every overlap bit)
                   cellArrayValue(objectId, 0) = homeCell;

                   // turn off the non-overlapping neighbor slots, if any remain
                   for(; idx < 8; ++idx)
                       cellArrayValue(objectId, idx) = Cell(-1, -1);

                   // fill the key for later sorting
                   for(int slot = 0; slot < 8; ++slot)
                       cellArrayKey(objectId, slot) = cellArrayValue(objectId, slot).cid;
               });

    // write the trailing sentinel entry (key -1) into the last element
    Launch(1, 1, 0, m_stream)  //
        .apply(
            [cellArrayValue = cellArrayValue.viewer(),
             cellArrayKey   = cellArrayKey.viewer()] __device__() mutable
            {
                cellArrayKey(cellArrayKey.total_size() - 1)     = -1;
                cellArrayValue(cellArrayValue.total_size() - 1) = Cell(-1, -1);
            });
}
221
/// Sort the cell entries by key, collapse equal keys into runs, and
/// prefix-sum the run lengths so each unique cell knows where its objects
/// start inside `cellArrayValueSorted`.
///
/// Does nothing when `empty_level` is set.
template <typename Hash>
void SparseSpatialHashImpl<Hash>::count_object_per_cell()
{
    if(empty_level)
        return;

    // Keys are handed to the sort as unsigned 32-bit values, so the -1
    // entries become the largest possible key.
    auto* unsortedKeys = (uint32_t*)cellArrayKey.data();
    auto* sortedKeys   = (uint32_t*)cellArrayKeySorted.data();

    DeviceRadixSort(m_stream).SortPairs(unsortedKeys,                 // in
                                        sortedKeys,                   // out
                                        cellArrayValue.data(),        // in
                                        cellArrayValueSorted.data(),  // out
                                        cellArrayValue.size());

    // run-length encode the sorted keys: one run per distinct cell key
    const auto entryCount = uniqueKey.size();
    DeviceRunLengthEncode(m_stream).Encode(cellArrayKeySorted.data(),  // in
                                           uniqueKey.data(),           // out
                                           objCountInCell.data(),      // out
                                           uniqueKeyCount.data(),      // out
                                           entryCount);

    // bring the number of runs back to the host
    const int runCount = uniqueKeyCount;

    // shrink the per-cell buffers to the number of runs actually found
    BufferLaunch(m_stream)
        .resize(uniqueKey, runCount)
        .resize(objCountInCell, runCount)
        .resize(objCountInCellPrefixSum, runCount)
        .resize(collisionPairCount, runCount)
        .resize(collisionPairPrefixSum, runCount);

    // one run is not counted as a valid cell (presumably the -1 sentinel run)
    validCellCount = runCount - 1;

    // we still prefix sum the uniqueKeyCount cell-object-pair
    // because we need to know the total number of collision pairs
    // which is at the last element of the prefix sum array
    DeviceScan(m_stream).ExclusiveSum(objCountInCell.data(),           // in
                                      objCountInCellPrefixSum.data(),  // out
                                      validCellCount + 1);
}
259
/// Two-pass ("count, then fill") construction of the collision pair list.
///
/// @param pred            user predicate, invoked as pred(oid0, oid1) on the device.
/// @param collisionPairs  buffer that is grown by the number of pairs found;
///                        the new pairs are written after its current contents.
///
/// Does nothing when `empty_level` is set.
template <typename Hash>
template <typename Pred>
void SparseSpatialHashImpl<Hash>::simple_setup_collision_pairs(Pred&& pred,
                                                               DeviceBuffer<CollisionPair>& collisionPairs)
{
    if(empty_level)
        return;

    // using a simple in-thread counting way to get the total number of collision pairs.
    // `pred` is needed again by the fill pass, so it is passed as an lvalue here
    // (the counting kernel captures a copy) instead of being forwarded twice.
    simple_count_collision_pairs(pred);
    alloc_collision_pair_list(collisionPairs, sum);
    // last use of `pred`: now it may be forwarded (and moved from)
    simple_fill_collision_pair_list(collisionPairs, std::forward<Pred>(pred));
}
273
/// Counting pass of the simple (one thread per cell) pair detection.
///
/// For every valid cell, counts the object pairs (i < j) inside that cell
/// that are not ignorable, whose bounding spheres intersect, and that the
/// user predicate accepts. The per-cell counts go to `collisionPairCount`;
/// their exclusive prefix sum goes to `collisionPairPrefixSum`, and the
/// element at index `validCellCount` of that prefix sum is copied to `sum`.
template <typename Hash>
template <typename Pred>
void SparseSpatialHashImpl<Hash>::simple_count_collision_pairs(Pred&& pred)
{
    using CallableType = raw_type_t<Pred>;

    using namespace muda;

    muda::ParallelFor(0, m_stream)  //
        .apply(validCellCount,
               [spheres                 = spheres.viewer(),
                objCountInCell          = objCountInCell.viewer(),
                objCountInCellPrefixSum = objCountInCellPrefixSum.viewer(),
                cellArrayValueSorted    = cellArrayValueSorted.viewer(),
                collisionPairCount      = collisionPairCount.viewer(),
                pred                    = std::forward<Pred>(pred),
                level = this->level] __device__(int cell) mutable
               {
                   int objectsInCell = objCountInCell(cell);
                   int firstEntry    = objCountInCellPrefixSum(cell);
                   int hits          = 0;

                   for(int a = 0; a < objectsInCell; ++a)
                   {
                       auto entryA  = cellArrayValueSorted(firstEntry + a);
                       auto idA     = entryA.oid;
                       auto sphereA = spheres(idA);

                       // the first object of a pair must not be below this level
                       if(sphereA.level < level)
                           continue;

                       for(int b = a + 1; b < objectsInCell; ++b)
                       {
                           auto entryB  = cellArrayValueSorted(firstEntry + b);
                           auto idB     = entryB.oid;
                           auto sphereB = spheres(idB);

                           // same short-circuit order: ignore test, exact test, user predicate
                           if(Cell::allow_ignore(entryA, entryB))
                               continue;
                           if(!intersect(sphereA, sphereB))
                               continue;
                           if(pred(idA, idB))
                               ++hits;
                       }
                   }

                   collisionPairCount(cell) = hits;
               })
        .wait();

    int keycount = uniqueKeyCount;
    if(!keycount)
    {
        sum = 0;
        return;
    }

    DeviceScan(m_stream).ExclusiveSum(
        collisionPairCount.data(), collisionPairPrefixSum.data(), keycount);

    // the total sits one past the last valid cell in the exclusive sum
    auto totalIndex = validCellCount;

    BufferLaunch(m_stream)  //
        .copy(&sum, collisionPairPrefixSum.view(totalIndex))
        .wait();
}
335
/// Grow `collisionPairs` by `totalCollisionPairCount` elements and remember
/// in `pairListOffset` where the newly added region starts.
template <typename Hash>
void SparseSpatialHashImpl<Hash>::alloc_collision_pair_list(DeviceBuffer<CollisionPair>& collisionPairs,
                                                            int totalCollisionPairCount)
{
    // size before growing == index of the first new slot
    const auto previousSize = collisionPairs.size();

    pairListOffset = previousSize;
    collisionPairs.resize(previousSize + totalCollisionPairCount);
}
343
/// Fill pass of the simple (one thread per cell) pair detection.
///
/// Repeats the pair tests of the counting pass and writes every accepted
/// pair into `collisionPairs`, starting at `pairListOffset` plus the cell's
/// entry in `collisionPairPrefixSum`.
template <typename Hash>
template <typename Pred>
void SparseSpatialHashImpl<Hash>::simple_fill_collision_pair_list(DeviceBuffer<CollisionPair>& collisionPairs,
                                                                  Pred&& pred)
{
    using namespace muda;

    muda::ParallelFor(0, m_stream)
        .apply(validCellCount,
               [spheres                 = spheres.viewer(),
                objCountInCell          = objCountInCell.viewer(),
                objCountInCellPrefixSum = objCountInCellPrefixSum.viewer(),
                cellArrayValueSorted    = cellArrayValueSorted.viewer(),
                collisionPairCount      = collisionPairCount.viewer(),
                collisionPairPrefixSum  = collisionPairPrefixSum.viewer(),
                collisionPairs = collisionPairs.view(pairListOffset).viewer(),
                pred           = std::forward<Pred>(pred),
                level          = this->level] __device__(int cell) mutable
               {
                   int objectsInCell = objCountInCell(cell);
                   int firstEntry    = objCountInCellPrefixSum(cell);
                   int writeBase     = collisionPairPrefixSum(cell);
                   int written       = 0;

                   for(int a = 0; a < objectsInCell; ++a)
                   {
                       auto entryA  = cellArrayValueSorted(firstEntry + a);
                       auto idA     = entryA.oid;
                       auto sphereA = spheres(idA);

                       if(sphereA.level < level)
                           continue;

                       for(int b = a + 1; b < objectsInCell; ++b)
                       {
                           auto entryB  = cellArrayValueSorted(firstEntry + b);
                           auto idB     = entryB.oid;
                           auto sphereB = spheres(idB);

                           // entries were created by testing the proxy sphere
                           if(Cell::allow_ignore(entryA, entryB))
                               continue;
                           // test the bounding spheres to get the exact collision result
                           if(!intersect(sphereA, sphereB))
                               continue;
                           // user predicate
                           if(!pred(idA, idB))
                               continue;

                           CollisionPair p{idA, idB};
                           collisionPairs(writeBase + written) = p;
                           ++written;
                       }
                   }
               });
}
394
/// Map an ordered pair (i, j) with i < j, drawn from `objCount` objects, to
/// its position in the row-major enumeration
/// (0,1), (0,2), ..., (0,objCount-1), (1,2), ...
constexpr int ij_to_cell_local_index(int i, int j, int objCount)
{
    // number of pairs whose first index is smaller than i
    const int pairsInEarlierRows = ((objCount - 1) + (objCount - i)) * i / 2;
    // position of j within row i (row i starts at j == i + 1)
    const int offsetInRow = j - i - 1;
    return pairsInEarlierRows + offsetInRow;
}
399
/// Load-balanced pair detection: one thread per *potential* pair instead of
/// one thread per cell.
///
/// Every valid cell with n objects reserves n*(n-1)/2 + 1 slots (one leading
/// sentinel slot plus one slot per unordered object pair). A flat index over
/// all slots is mapped back to (cell, i, j) on the device, the pair is tested,
/// and the accepted pairs are compacted into `collisionPairs`.
///
/// @param append          when false, `collisionPairs` is cleared first; when
///                        true, new pairs are appended after its current contents.
/// @param collisionPairs  output buffer; on return its size is the previous
///                        size (0 if cleared) plus the number of accepted pairs.
/// @param pred            user predicate, invoked as pred(oid0, oid1) on the device.
template <typename Hash>
template <typename Pred>
void SparseSpatialHashImpl<Hash>::balanced_setup_collision_pairs(
    bool append, DeviceBuffer<CollisionPair>& collisionPairs, Pred&& pred)
{
    if(empty_level)
    {
        // Nothing to detect, but `append == false` still means the caller
        // expects the buffer to hold only this call's (zero) results.
        if(!append)
            BufferLaunch(m_stream).clear(collisionPairs);
        return;
    }

    cellToCollisionPairUpperBound.resize(validCellCount + 1);
    cellToCollisionPairUpperBoundPrefixSum.resize(validCellCount + 1);

    // eg:
    // objCountInCell                = [1, 2, 3]
    // cellToCollisionPairUpperBound = [1, 2, 4, x]
    ParallelFor(0, m_stream)
        .kernel_name("setup_collision_pairs_upper_bound")
        .apply(validCellCount,
               [objCountInCell = objCountInCell.viewer(),
                cellToCollisionPairUpperBound =
                    cellToCollisionPairUpperBound.view(0, validCellCount).viewer()] __device__(int cell) mutable
               {
                   int size = objCountInCell(cell);
                   cellToCollisionPairUpperBound(cell) = size * (size - 1) / 2 + 1;  // +1 for sentinel
               });

    // e.g.
    // cellToCollisionPairUpperBound          = [1, 2, 4, x]
    // cellToCollisionPairUpperBoundPrefixSum = [0, 1, 3, 7]
    // (the trailing x is read by the scan but does not influence any output)
    DeviceScan(m_stream).ExclusiveSum(cellToCollisionPairUpperBound.data(),
                                      cellToCollisionPairUpperBoundPrefixSum.data(),
                                      validCellCount + 1);

    // e.g.
    // count = 7
    // The value is consumed on the host right below, so wait for the copy.
    int collisionPairCountUpperBound = 0;
    BufferLaunch(m_stream)
        .copy<int>(&collisionPairCountUpperBound,
                   cellToCollisionPairUpperBoundPrefixSum.view(validCellCount))
        .wait();

    // e.g.
    //                                              0  1  2  3  4  5  6
    // potentialCollisionPairIdToCellIndexBuffer = [1, 0, 1, 0, 0, 0, 1]
    // (a 1 marks the last slot of every cell's range)
    BufferLaunch(m_stream)
        .resize(potentialCollisionPairIdToCellIndexBuffer, collisionPairCountUpperBound)
        .fill(potentialCollisionPairIdToCellIndexBuffer.view(), 0);
    ParallelFor(0, m_stream)
        .kernel_name("fill_last_collision_pair_count")
        .apply(validCellCount,
               [objCountInCell = objCountInCell.cviewer(),
                cellToCollisionPairUpperBound =
                    cellToCollisionPairUpperBound.view(0, validCellCount).viewer(),
                cellToCollisionPairUpperBoundPrefixSum =
                    cellToCollisionPairUpperBoundPrefixSum.cviewer(),
                potentialCollisionPairIdToCellIndexBuffer =
                    potentialCollisionPairIdToCellIndexBuffer
                        .view(0, collisionPairCountUpperBound)
                        .viewer()] __device__(int cell) mutable
               {
                   int size = objCountInCell(cell);
                   MUDA_KERNEL_ASSERT(size > 0,
                                      "Fatal Algo Error: objCountInCell(%d)=%d, an empty cell shouldn't be recorded, something goes wrong!",
                                      cell,
                                      size);
                   int start = cellToCollisionPairUpperBoundPrefixSum(cell);
                   int upper_bound = cellToCollisionPairUpperBound(cell);
                   potentialCollisionPairIdToCellIndexBuffer(start + upper_bound - 1) = 1;
               });

    BufferLaunch(m_stream).resize(potentialCollisionPairIdToCellIndex,
                                  collisionPairCountUpperBound);

    // exclusive sum of the markers == owning cell index of every slot
    DeviceScan(m_stream).ExclusiveSum(potentialCollisionPairIdToCellIndexBuffer.data(),
                                      potentialCollisionPairIdToCellIndex.data(),
                                      collisionPairCountUpperBound);

    if(!append)
        BufferLaunch(m_stream).clear(collisionPairs);

    auto collisionPairOffset = collisionPairs.size();

    // reserve room for the worst case; shrunk to the real count at the end
    BufferLaunch(m_stream)
        .resize(collisionPairBuffer, collisionPairCountUpperBound)
        .resize(collisionPairs, collisionPairOffset + collisionPairCountUpperBound);


    // e.g.
    // cellArrayValueSorted                   = [0, 1, 2, 3, 4, 5]
    // objCountInCell                         = [1, 2, 3]
    // cellToCollisionPairUpperBoundPrefixSum = [0, 1, 3, 7]
    //
    // potentialCollisionPairIdToCellIndex    = [0, 1, 1, 2, 2, 2, 2]
    ParallelFor(0, m_stream)
        .kernel_name("fill_collision_pairs")
        .apply(
            collisionPairCountUpperBound,
            [spheres = spheres.cviewer().name("spheres"),
             objCountInCell = objCountInCell.cviewer().name("objCountInCell"),
             cellOffsets = objCountInCellPrefixSum.cviewer().name("cellOffsets"),
             cellArrayValueSorted = cellArrayValueSorted.cviewer().name("cellArrayValueSorted"),
             cellToCollisionPairUpperBoundPrefixSum =
                 cellToCollisionPairUpperBoundPrefixSum.cviewer().name("cellToCollisionPairUpperBoundPrefixSum"),
             potentialCollisionPairIdToCellIndex =
                 potentialCollisionPairIdToCellIndex.cviewer().name("potentialCollisionPairIdToCellIndex"),
             collisionPairBuffer = collisionPairBuffer.viewer().name("collisionPairBuffer"),
             pred  = std::forward<Pred>(pred),
             level = this->level] __device__(int cpI) mutable
            {
                int cellIndex = potentialCollisionPairIdToCellIndex(cpI);

                int start = cellToCollisionPairUpperBoundPrefixSum(cellIndex);

                int objCount = objCountInCell(cellIndex);

                int cellOffset = cellOffsets(cellIndex);

                int cellLocalIndex = cpI - start;

                // every slot starts out invalid; only accepted pairs overwrite it
                collisionPairBuffer(cpI) = CollisionPair::invalid();

                if(cellLocalIndex == 0)  // ignore the first sentinel
                    return;

                cellLocalIndex -= 1;

                // use the closed-form inverse of the row-major pair enumeration
                // to get i, j (evaluated in double precision on purpose)

                int i = objCount - 2
                        - floor(sqrt(-8.0 * cellLocalIndex + 4 * objCount * (objCount - 1) - 7) / 2.0
                                - 0.5);
                int j = cellLocalIndex + i + 1 - objCount * (objCount - 1) / 2
                        + (objCount - i) * ((objCount - i) - 1) / 2;

                MUDA_KERNEL_ASSERT(
                    i >= 0 && j >= 0 && i < objCount && j < objCount, "i=%d, j=%d", i, j);

                // round-trip check of the floating point inverse
                MUDA_KERNEL_ASSERT(ij_to_cell_local_index(i, j, objCount) == cellLocalIndex,
                                   "numerical error happen!"
                                   "i=%d, j=%d, objCount=%d, cellLocalIndex=%d",
                                   i,
                                   j,
                                   objCount,
                                   cellLocalIndex);

                Cell cell0 = cellArrayValueSorted(cellOffset + i);
                Cell cell1 = cellArrayValueSorted(cellOffset + j);

                int oid0 = cell0.oid;
                int oid1 = cell1.oid;

                auto s0 = spheres(oid0);
                auto s1 = spheres(oid1);

                // neither sphere may be above this level ...
                if(s0.level > level || s1.level > level)
                    return;

                // ... and at least one of them must be exactly at this level
                if(s0.level < level && s1.level < level)
                    return;

                if(!Cell::allow_ignore(cell0, cell1)  // cell0, cell1 are created by test the proxy sphere
                   && intersect(s0, s1)  // test the bounding spheres to get exact collision result
                   && pred(oid0, oid1))  // user predicate
                {
                    collisionPairBuffer(cpI) = CollisionPair{oid0, oid1};
                }
            });

    // select the valid collision pairs
    DeviceSelect(m_stream).If(collisionPairBuffer.data(),
                              collisionPairs.view(collisionPairOffset).data(),
                              validCollisionPairCount.data(),
                              collisionPairCountUpperBound,
                              [] CUB_RUNTIME_FUNCTION(const CollisionPair& p) -> bool
                              { return p.is_valid(); });

    // shrink from the worst-case size to the number of pairs actually selected
    BufferLaunch(m_stream).resize(collisionPairs, collisionPairOffset + validCollisionPairCount);
}
585} // namespace muda::spatial_hash::details
Definition buffer_launch.h:13
A std::vector like wrapper of cuda device memory, allows user to:
Definition device_buffer.h:46
Definition device_radix_sort.h:18
Definition device_run_length_encode.h:12
Definition device_scan.h:21
Definition device_select.h:12
A wrapper of raw cuda kernel launch in muda style, removing the <<<>>> usage, for better intellisense...
Definition launch.h:86
a frequently used parallel for loop, DynamicBlockDim and GridStrideLoop strategy are provided,...
Definition parallel_for.h:116