Skip to content

File device_segmented_reduce.h

File List > cub > device > device_segmented_reduce.h

Go to the documentation of this file

#pragma once
#include <muda/cub/device/cub_wrapper.h>
#include "details/cub_wrapper_macro_def.inl"
#ifndef __INTELLISENSE__
#include <cub/device/device_segmented_reduce.cuh>
#endif

namespace muda
{
//ref: https://nvlabs.github.io/cub/structcub_1_1_device_segmented_reduce.html
// Thin wrapper around the static functions of cub::DeviceSegmentedReduce.
//
// Every method forwards its arguments, followed by `_stream` and a trailing
// `false`, to the CUB function of the same name and returns a reference to
// this wrapper so calls can be chained.
//
// Two families of overloads are provided:
//   * without temp-storage parameters: wrapped in MUDA_CUB_WRAPPER_IMPL.
//     `d_temp_storage` / `temp_storage_bytes` are not declared in these
//     methods, so the macro is expected to provide them.
//   * with explicit `d_temp_storage` / `temp_storage_bytes` parameters (the
//     original CUB calling convention): wrapped in
//     MUDA_CUB_WRAPPER_FOR_COMPUTE_GRAPH_IMPL.
//
// NOTE(review): both macros live in details/cub_wrapper_macro_def.inl, which
// is not visible here. They are also expected to supply the `return`
// statement, since no method body contains one - confirm against the macro
// definitions. `_stream` is not declared in this class and presumably comes
// from CubWrapper.
//
// Common parameters (names follow the CUB API):
//   d_in             input iterator
//   d_out            output iterator
//   num_segments     number of segments
//   d_begin_offsets  iterator over the segment begin offsets
//   d_end_offsets    iterator over the segment end offsets
class DeviceSegmentedReduce : public CubWrapper<DeviceSegmentedReduce>
{
    using Base = CubWrapper<DeviceSegmentedReduce>;

  public:
    using Base::Base;

    // ------------------------------------------------------------------
    // Overloads without caller-provided temporary storage
    // ------------------------------------------------------------------

    // Forwards to cub::DeviceSegmentedReduce::Reduce with a user-supplied
    // reduction operator and initial value.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOp, typename T>
    DeviceSegmentedReduce& Reduce(InputIteratorT       d_in,
                                  OutputIteratorT      d_out,
                                  int                  num_segments,
                                  BeginOffsetIteratorT d_begin_offsets,
                                  EndOffsetIteratorT   d_end_offsets,
                                  ReductionOp          reduction_op,
                                  T                    initial_value)
    {
        MUDA_CUB_WRAPPER_IMPL(cub::DeviceSegmentedReduce::Reduce(d_temp_storage,
                                                                 temp_storage_bytes,
                                                                 d_in,
                                                                 d_out,
                                                                 num_segments,
                                                                 d_begin_offsets,
                                                                 d_end_offsets,
                                                                 reduction_op,
                                                                 initial_value,
                                                                 _stream,
                                                                 false));
    }

    // Forwards to cub::DeviceSegmentedReduce::Sum.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    DeviceSegmentedReduce& Sum(InputIteratorT       d_in,
                               OutputIteratorT      d_out,
                               int                  num_segments,
                               BeginOffsetIteratorT d_begin_offsets,
                               EndOffsetIteratorT   d_end_offsets)
    {
        MUDA_CUB_WRAPPER_IMPL(cub::DeviceSegmentedReduce::Sum(d_temp_storage,
                                                              temp_storage_bytes,
                                                              d_in,
                                                              d_out,
                                                              num_segments,
                                                              d_begin_offsets,
                                                              d_end_offsets,
                                                              _stream,
                                                              false));
    }

    // Forwards to cub::DeviceSegmentedReduce::Min.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    DeviceSegmentedReduce& Min(InputIteratorT       d_in,
                               OutputIteratorT      d_out,
                               int                  num_segments,
                               BeginOffsetIteratorT d_begin_offsets,
                               EndOffsetIteratorT   d_end_offsets)
    {
        MUDA_CUB_WRAPPER_IMPL(cub::DeviceSegmentedReduce::Min(d_temp_storage,
                                                              temp_storage_bytes,
                                                              d_in,
                                                              d_out,
                                                              num_segments,
                                                              d_begin_offsets,
                                                              d_end_offsets,
                                                              _stream,
                                                              false));
    }

    // Forwards to cub::DeviceSegmentedReduce::ArgMin.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    DeviceSegmentedReduce& ArgMin(InputIteratorT       d_in,
                                  OutputIteratorT      d_out,
                                  int                  num_segments,
                                  BeginOffsetIteratorT d_begin_offsets,
                                  EndOffsetIteratorT   d_end_offsets)
    {
        MUDA_CUB_WRAPPER_IMPL(cub::DeviceSegmentedReduce::ArgMin(d_temp_storage,
                                                                 temp_storage_bytes,
                                                                 d_in,
                                                                 d_out,
                                                                 num_segments,
                                                                 d_begin_offsets,
                                                                 d_end_offsets,
                                                                 _stream,
                                                                 false));
    }

    // Forwards to cub::DeviceSegmentedReduce::Max.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    DeviceSegmentedReduce& Max(InputIteratorT       d_in,
                               OutputIteratorT      d_out,
                               int                  num_segments,
                               BeginOffsetIteratorT d_begin_offsets,
                               EndOffsetIteratorT   d_end_offsets)
    {
        MUDA_CUB_WRAPPER_IMPL(cub::DeviceSegmentedReduce::Max(d_temp_storage,
                                                              temp_storage_bytes,
                                                              d_in,
                                                              d_out,
                                                              num_segments,
                                                              d_begin_offsets,
                                                              d_end_offsets,
                                                              _stream,
                                                              false));
    }

    // Forwards to cub::DeviceSegmentedReduce::ArgMax.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    DeviceSegmentedReduce& ArgMax(InputIteratorT       d_in,
                                  OutputIteratorT      d_out,
                                  int                  num_segments,
                                  BeginOffsetIteratorT d_begin_offsets,
                                  EndOffsetIteratorT   d_end_offsets)
    {
        MUDA_CUB_WRAPPER_IMPL(cub::DeviceSegmentedReduce::ArgMax(d_temp_storage,
                                                                 temp_storage_bytes,
                                                                 d_in,
                                                                 d_out,
                                                                 num_segments,
                                                                 d_begin_offsets,
                                                                 d_end_offsets,
                                                                 _stream,
                                                                 false));
    }

    // ------------------------------------------------------------------
    // Original CUB-style overloads: the caller passes the temporary storage
    // pointer and its size (taken by reference) explicitly.
    // ------------------------------------------------------------------

    // Forwards to cub::DeviceSegmentedReduce::Reduce using the caller's
    // temporary storage.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOp, typename T>
    DeviceSegmentedReduce& Reduce(void*                d_temp_storage,
                                  size_t&              temp_storage_bytes,
                                  InputIteratorT       d_in,
                                  OutputIteratorT      d_out,
                                  int                  num_segments,
                                  BeginOffsetIteratorT d_begin_offsets,
                                  EndOffsetIteratorT   d_end_offsets,
                                  ReductionOp          reduction_op,
                                  T                    initial_value)
    {
        MUDA_CUB_WRAPPER_FOR_COMPUTE_GRAPH_IMPL(
            cub::DeviceSegmentedReduce::Reduce(d_temp_storage,
                                               temp_storage_bytes,
                                               d_in,
                                               d_out,
                                               num_segments,
                                               d_begin_offsets,
                                               d_end_offsets,
                                               reduction_op,
                                               initial_value,
                                               _stream,
                                               false));
    }

    // Forwards to cub::DeviceSegmentedReduce::Sum using the caller's
    // temporary storage.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    DeviceSegmentedReduce& Sum(void*                d_temp_storage,
                               size_t&              temp_storage_bytes,
                               InputIteratorT       d_in,
                               OutputIteratorT      d_out,
                               int                  num_segments,
                               BeginOffsetIteratorT d_begin_offsets,
                               EndOffsetIteratorT   d_end_offsets)
    {
        MUDA_CUB_WRAPPER_FOR_COMPUTE_GRAPH_IMPL(
            cub::DeviceSegmentedReduce::Sum(d_temp_storage,
                                            temp_storage_bytes,
                                            d_in,
                                            d_out,
                                            num_segments,
                                            d_begin_offsets,
                                            d_end_offsets,
                                            _stream,
                                            false));
    }

    // Forwards to cub::DeviceSegmentedReduce::Min using the caller's
    // temporary storage.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    DeviceSegmentedReduce& Min(void*                d_temp_storage,
                               size_t&              temp_storage_bytes,
                               InputIteratorT       d_in,
                               OutputIteratorT      d_out,
                               int                  num_segments,
                               BeginOffsetIteratorT d_begin_offsets,
                               EndOffsetIteratorT   d_end_offsets)
    {
        MUDA_CUB_WRAPPER_FOR_COMPUTE_GRAPH_IMPL(
            cub::DeviceSegmentedReduce::Min(d_temp_storage,
                                            temp_storage_bytes,
                                            d_in,
                                            d_out,
                                            num_segments,
                                            d_begin_offsets,
                                            d_end_offsets,
                                            _stream,
                                            false));
    }

    // Forwards to cub::DeviceSegmentedReduce::ArgMin using the caller's
    // temporary storage.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    DeviceSegmentedReduce& ArgMin(void*                d_temp_storage,
                                  size_t&              temp_storage_bytes,
                                  InputIteratorT       d_in,
                                  OutputIteratorT      d_out,
                                  int                  num_segments,
                                  BeginOffsetIteratorT d_begin_offsets,
                                  EndOffsetIteratorT   d_end_offsets)
    {
        MUDA_CUB_WRAPPER_FOR_COMPUTE_GRAPH_IMPL(
            cub::DeviceSegmentedReduce::ArgMin(d_temp_storage,
                                               temp_storage_bytes,
                                               d_in,
                                               d_out,
                                               num_segments,
                                               d_begin_offsets,
                                               d_end_offsets,
                                               _stream,
                                               false));
    }

    // Forwards to cub::DeviceSegmentedReduce::Max using the caller's
    // temporary storage.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    DeviceSegmentedReduce& Max(void*                d_temp_storage,
                               size_t&              temp_storage_bytes,
                               InputIteratorT       d_in,
                               OutputIteratorT      d_out,
                               int                  num_segments,
                               BeginOffsetIteratorT d_begin_offsets,
                               EndOffsetIteratorT   d_end_offsets)
    {
        MUDA_CUB_WRAPPER_FOR_COMPUTE_GRAPH_IMPL(
            cub::DeviceSegmentedReduce::Max(d_temp_storage,
                                            temp_storage_bytes,
                                            d_in,
                                            d_out,
                                            num_segments,
                                            d_begin_offsets,
                                            d_end_offsets,
                                            _stream,
                                            false));
    }

    // Non-segmented ArgMax signature (no offset iterators). This overload
    // previously forwarded (d_in, d_out, num_items, _stream, false) to
    // cub::DeviceSegmentedReduce::ArgMax, which requires begin/end offset
    // iterators; `_stream` and `false` therefore ended up in the
    // offset-iterator positions, so the call could not instantiate into a
    // valid segmented reduction. The signature is kept but deleted so that
    // misuse is reported at the call site. Use the overload below, which
    // takes d_begin_offsets / d_end_offsets.
    template <typename InputIteratorT, typename OutputIteratorT>
    DeviceSegmentedReduce& ArgMax(void*           d_temp_storage,
                                  size_t&         temp_storage_bytes,
                                  InputIteratorT  d_in,
                                  OutputIteratorT d_out,
                                  int             num_items) = delete;

    // Forwards to cub::DeviceSegmentedReduce::ArgMax using the caller's
    // temporary storage.
    template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    DeviceSegmentedReduce& ArgMax(void*                d_temp_storage,
                                  size_t&              temp_storage_bytes,
                                  InputIteratorT       d_in,
                                  OutputIteratorT      d_out,
                                  int                  num_segments,
                                  BeginOffsetIteratorT d_begin_offsets,
                                  EndOffsetIteratorT   d_end_offsets)
    {
        MUDA_CUB_WRAPPER_FOR_COMPUTE_GRAPH_IMPL(
            cub::DeviceSegmentedReduce::ArgMax(d_temp_storage,
                                               temp_storage_bytes,
                                               d_in,
                                               d_out,
                                               num_segments,
                                               d_begin_offsets,
                                               d_end_offsets,
                                               _stream,
                                               false));
    }
};
}  // namespace muda

#include "details/cub_wrapper_macro_undef.inl"