
Commit b40e461 (parent 23b71e8)

Feat: Add tensor reduction

8 files changed: 399 additions, 1 deletion

crates/cudnn/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -69,7 +69,7 @@ The previous tensor descriptor can be used together with a `i8` device buffer an
 
 Currently this crate does not support `f16` and `bf16` data types.
 
-### Tensor formats
+### cuDNN tensor formats
 
 We decided not to check tensor format configurations at compile time, since it is too strong of a requirement. As a consequence, should you mess up, the program will fail at run-time. A proper understanding of the cuDNN API mechanics is thus fundamental to properly use this crate.
 
```

crates/cudnn/src/activation/activation_mode.rs

Lines changed: 1 addition & 0 deletions

```diff
@@ -4,6 +4,7 @@ use crate::sys;
 ///
 /// cuDNN [docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnActivationMode_t)
 /// may offer additional information about the API behavior.
+#[non_exhaustive]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum ActivationMode {
     /// Selects the sigmoid function.
```

crates/cudnn/src/lib.rs

Lines changed: 2 additions & 0 deletions

```diff
@@ -13,6 +13,7 @@ mod math_type;
 mod nan_propagation;
 mod op;
 mod pooling;
+mod reduction;
 mod rnn;
 mod softmax;
 mod sys;
@@ -31,6 +32,7 @@ pub use math_type::*;
 pub use nan_propagation::*;
 pub use op::*;
 pub use pooling::*;
+pub use reduction::*;
 pub use rnn::*;
 pub use softmax::*;
 pub use tensor::*;
```
crates/cudnn/src/reduction/indices_type.rs

Lines changed: 22 additions & 0 deletions

```rust
use crate::sys;

/// Indicates the data type of the indices computed by a reduction operation.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum IndicesType {
    /// 8-bit unsigned integer indices.
    U8,
    /// 16-bit unsigned integer indices.
    U16,
    /// 32-bit unsigned integer indices.
    U32,
    /// 64-bit unsigned integer indices.
    U64,
}

impl From<IndicesType> for sys::cudnnIndicesType_t {
    fn from(mode: IndicesType) -> Self {
        match mode {
            IndicesType::U8 => Self::CUDNN_8BIT_INDICES,
            IndicesType::U16 => Self::CUDNN_16BIT_INDICES,
            IndicesType::U32 => Self::CUDNN_32BIT_INDICES,
            IndicesType::U64 => Self::CUDNN_64BIT_INDICES,
        }
    }
}
```
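
One practical note: the `reduce` method added by this commit (see `mod.rs` below) hands the computed indices back as raw bytes (`impl GpuBuffer<u8>`), so a caller decoding them needs the element width of the chosen variant. A minimal hedged helper, not part of the commit, with widths assumed from the variant names:

```rust
/// Byte width of one index element for a given `IndicesType`.
/// Hedged helper, not part of this commit; widths are assumed
/// from the variant names.
fn index_element_width(t: IndicesType) -> usize {
    match t {
        IndicesType::U8 => 1,
        IndicesType::U16 => 2,
        IndicesType::U32 => 4,
        IndicesType::U64 => 8,
    }
}
```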

crates/cudnn/src/reduction/mod.rs

Lines changed: 223 additions & 0 deletions

````rust
mod indices_type;
mod reduce_indices;
mod reduce_op;
mod reduction_descriptor;

pub use indices_type::*;
pub use reduce_indices::*;
pub use reduce_op::*;
pub use reduction_descriptor::*;

use std::mem::MaybeUninit;

use cust::memory::GpuBuffer;

use crate::{
    sys, CudnnContext, CudnnError, DataType, IntoResult, ScalingDataType, TensorDescriptor,
};

impl CudnnContext {
    /// Returns the minimum size of the workspace to be passed to the reduction given the input and
    /// output tensors.
    ///
    /// # Arguments
    ///
    /// * `desc` - reduction descriptor.
    ///
    /// * `a_desc` - input tensor descriptor.
    ///
    /// * `c_desc` - output tensor descriptor.
    pub fn get_reduction_workspace_size<T, U, V>(
        &self,
        desc: &ReductionDescriptor<T>,
        a_desc: &TensorDescriptor<U>,
        c_desc: &TensorDescriptor<V>,
    ) -> Result<usize, CudnnError>
    where
        T: DataType,
        U: DataType,
        V: DataType,
    {
        let mut size = MaybeUninit::uninit();

        unsafe {
            sys::cudnnGetReductionWorkspaceSize(
                self.raw,
                desc.raw,
                a_desc.raw,
                c_desc.raw,
                size.as_mut_ptr(),
            )
            .into_result()?;

            Ok(size.assume_init())
        }
    }

    /// Returns the minimum size of the index space to be passed to the reduction given the input
    /// and output tensors.
    ///
    /// # Arguments
    ///
    /// * `desc` - reduction descriptor.
    ///
    /// * `a_desc` - input tensor descriptor.
    ///
    /// * `c_desc` - output tensor descriptor.
    pub fn get_reduction_indices_size<T, U, V>(
        &self,
        desc: &ReductionDescriptor<T>,
        a_desc: &TensorDescriptor<U>,
        c_desc: &TensorDescriptor<V>,
    ) -> Result<usize, CudnnError>
    where
        T: DataType,
        U: DataType,
        V: DataType,
    {
        let mut size = MaybeUninit::uninit();

        unsafe {
            sys::cudnnGetReductionIndicesSize(
                self.raw,
                desc.raw,
                a_desc.raw,
                c_desc.raw,
                size.as_mut_ptr(),
            )
            .into_result()?;

            Ok(size.assume_init())
        }
    }

    /// This function reduces tensor `a` by implementing the equation:
    ///
    /// C = alpha * reduce op ( A ) + gamma * C
    ///
    /// given tensors `a` and `c` and scaling factors `alpha` and `gamma`.
    /// Each dimension of the output tensor `c` must match the corresponding dimension of the
    /// input tensor `a` or must be equal to 1.
    ///
    /// The dimensions equal to 1 indicate the dimensions of `a` to be reduced.
    ///
    /// **Do note** that currently only the 32-bit indices type is supported and that the data
    /// types of the tensors `a` and `c` must match if of type double. In this case, `alpha` and
    /// `gamma` are both assumed to be of type double.
    ///
    /// # Arguments
    ///
    /// * `desc` - tensor reduction descriptor.
    ///
    /// * `indices` - indices buffer in device memory.
    ///
    /// * `workspace` - workspace for the reduction operation.
    ///
    /// * `alpha` - scaling factor for the input tensor.
    ///
    /// * `a_desc` - tensor descriptor for the input tensor.
    ///
    /// * `a` - input tensor in device memory.
    ///
    /// * `gamma` - scaling factor for the output tensor.
    ///
    /// * `c_desc` - tensor descriptor for the output tensor.
    ///
    /// * `c` - output tensor in device memory.
    ///
    /// # Errors
    ///
    /// Returns errors if an unsupported configuration of arguments is detected.
    ///
    /// # Examples
    ///
    /// ```
    /// # use std::error::Error;
    /// #
    /// # fn main() -> Result<(), Box<dyn Error>> {
    /// use cudnn::{CudnnContext, NanPropagation, ReduceOp, ReduceIndices, ReductionDescriptor, TensorDescriptor};
    /// use cust::memory::DeviceBuffer;
    ///
    /// let ctx = CudnnContext::new()?;
    ///
    /// let op = ReduceOp::Add;
    /// let nan_policy = NanPropagation::PropagateNaN;
    /// let indices = ReduceIndices::None;
    /// let indices_type = None;
    ///
    /// let desc = ReductionDescriptor::<f32>::new(op, nan_policy, indices, indices_type)?;
    ///
    /// let alpha = 1.0;
    /// let a_desc = TensorDescriptor::<i8>::new_strides(&[1, 1, 1, 5], &[5, 5, 5, 1])?;
    /// let a = DeviceBuffer::<i8>::from_slice(&[4, 4, 4, 4, 4])?;
    ///
    /// let gamma = 1.0;
    /// let c_desc = TensorDescriptor::<i8>::new_strides(&[1, 1, 1, 1], &[1, 1, 1, 1])?;
    /// let mut c = DeviceBuffer::<i8>::from_slice(&[0])?;
    ///
    /// let workspace_size = ctx.get_reduction_workspace_size(&desc, &a_desc, &c_desc)?;
    /// let mut workspace = unsafe { DeviceBuffer::uninitialized(workspace_size)? };
    ///
    /// let indices: Option<&mut DeviceBuffer<u8>> = None;
    ///
    /// ctx.reduce(&desc, indices, &mut workspace, alpha, &a_desc, &a, gamma, &c_desc, &mut c)?;
    ///
    /// let c_host = c.as_host_vec()?;
    ///
    /// assert!(c_host.iter().all(|x| *x == 20));
    /// # Ok(())
    /// # }
    /// ```
    pub fn reduce<CompT, U, V>(
        &self,
        desc: &ReductionDescriptor<CompT>,
        indices: Option<&mut impl GpuBuffer<u8>>,
        workspace: &mut impl GpuBuffer<u8>,
        alpha: CompT,
        a_desc: &TensorDescriptor<U>,
        a: &impl GpuBuffer<U>,
        gamma: CompT,
        c_desc: &TensorDescriptor<V>,
        c: &mut impl GpuBuffer<V>,
    ) -> Result<(), CudnnError>
    where
        CompT: ScalingDataType<U>,
        U: DataType,
        V: DataType,
    {
        let (indices_ptr, indices_size) = {
            indices.map_or((std::ptr::null_mut(), 0), |indices| {
                (indices.as_device_ptr().as_mut_ptr() as _, indices.len())
            })
        };

        let workspace_ptr = workspace.as_device_ptr().as_mut_ptr() as _;
        let workspace_size = workspace.len();

        let a_data = a.as_device_ptr().as_ptr() as _;
        let c_data = c.as_device_ptr().as_mut_ptr() as _;

        let alpha = &alpha as *const CompT as _;
        let gamma = &gamma as *const CompT as _;

        unsafe {
            sys::cudnnReduceTensor(
                self.raw,
                desc.raw,
                indices_ptr,
                indices_size,
                workspace_ptr,
                workspace_size,
                alpha,
                a_desc.raw,
                a_data,
                gamma,
                c_desc.raw,
                c_data,
            )
            .into_result()?;
        }

        Ok(())
    }
}
````
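
The doc-test above covers the no-indices path, so here is a hedged sketch, not part of the commit, of a `Max` reduction that also materializes flattened indices, exercising `get_reduction_indices_size` and `IndicesType::U32` (per the docs, the only width currently supported). It reuses only API surface introduced in this commit; the byte-level decoding of the indices buffer and cuDNN's support for this exact `i8` configuration are assumptions, and an unsupported configuration would simply surface as a `CudnnError`.

```rust
use cudnn::{
    CudnnContext, IndicesType, NanPropagation, ReduceIndices, ReduceOp, ReductionDescriptor,
    TensorDescriptor,
};
use cust::memory::DeviceBuffer;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let ctx = CudnnContext::new()?;

    // Max reduction that also returns the flattened index of the maximum.
    // Only the 32-bit indices type is currently supported.
    let desc = ReductionDescriptor::<f32>::new(
        ReduceOp::Max,
        NanPropagation::PropagateNaN,
        ReduceIndices::Flattened,
        Some(IndicesType::U32),
    )?;

    // Reduce a 1 x 1 x 1 x 5 tensor down to a single element.
    let a_desc = TensorDescriptor::<i8>::new_strides(&[1, 1, 1, 5], &[5, 5, 5, 1])?;
    let a = DeviceBuffer::<i8>::from_slice(&[2, 7, 1, 5, 4])?;

    let c_desc = TensorDescriptor::<i8>::new_strides(&[1, 1, 1, 1], &[1, 1, 1, 1])?;
    let mut c = DeviceBuffer::<i8>::from_slice(&[0])?;

    // Query both scratch sizes before launching the reduction.
    let workspace_size = ctx.get_reduction_workspace_size(&desc, &a_desc, &c_desc)?;
    let indices_size = ctx.get_reduction_indices_size(&desc, &a_desc, &c_desc)?;

    let mut workspace = unsafe { DeviceBuffer::<u8>::uninitialized(workspace_size)? };
    let mut indices = unsafe { DeviceBuffer::<u8>::uninitialized(indices_size)? };

    // C = 1.0 * max(A) + 1.0 * C, with C zero-initialized.
    ctx.reduce(&desc, Some(&mut indices), &mut workspace, 1.0, &a_desc, &a, 1.0, &c_desc, &mut c)?;

    assert_eq!(c.as_host_vec()?[0], 7); // the maximum element

    // Assumption: indices come back as native-endian 32-bit values packed
    // into the raw byte buffer; `7` sits at flattened position 1.
    let bytes = indices.as_host_vec()?;
    let first = u32::from_ne_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
    assert_eq!(first, 1);

    Ok(())
}
```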
crates/cudnn/src/reduction/reduce_indices.rs

Lines changed: 21 additions & 0 deletions

```rust
use crate::sys;

/// Indicates whether a reduction operation should compute indices or not.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ReduceIndices {
    /// Do not compute indices.
    None,
    /// Compute indices. The resulting indices are relative to the dimensions being reduced, and
    /// flattened.
    Flattened,
}

impl From<ReduceIndices> for sys::cudnnReduceTensorIndices_t {
    fn from(mode: ReduceIndices) -> Self {
        match mode {
            ReduceIndices::None => Self::CUDNN_REDUCE_TENSOR_NO_INDICES,
            ReduceIndices::Flattened => Self::CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
        }
    }
}
```
crates/cudnn/src/reduction/reduce_op.rs

Lines changed: 32 additions & 0 deletions

```rust
use crate::sys;

/// Tensor reduction operation.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ReduceOp {
    /// Sum of the elements.
    Add,
    /// Product of the elements.
    Mul,
    /// Minimum of the elements.
    Min,
    /// Maximum of the elements.
    Max,
    /// Maximum of the absolute values of the elements.
    Amax,
    /// Average of the elements.
    Avg,
    /// Sum of the absolute values of the elements (L1 norm).
    Norm1,
    /// Square root of the sum of squares of the elements (L2 norm).
    Norm2,
    /// Product of the elements, ignoring zero-valued entries.
    MulNoZeros,
}

impl From<ReduceOp> for sys::cudnnReduceTensorOp_t {
    fn from(op: ReduceOp) -> Self {
        match op {
            ReduceOp::Add => Self::CUDNN_REDUCE_TENSOR_ADD,
            ReduceOp::Mul => Self::CUDNN_REDUCE_TENSOR_MUL,
            ReduceOp::Min => Self::CUDNN_REDUCE_TENSOR_MIN,
            ReduceOp::Max => Self::CUDNN_REDUCE_TENSOR_MAX,
            ReduceOp::Amax => Self::CUDNN_REDUCE_TENSOR_AMAX,
            ReduceOp::Avg => Self::CUDNN_REDUCE_TENSOR_AVG,
            ReduceOp::Norm1 => Self::CUDNN_REDUCE_TENSOR_NORM1,
            ReduceOp::Norm2 => Self::CUDNN_REDUCE_TENSOR_NORM2,
            ReduceOp::MulNoZeros => Self::CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS,
        }
    }
}
```
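
To illustrate swapping the operation, here is a hedged variant of the commit's own doc-test that computes an L1 norm instead of a sum. Everything except the op, the input values, and the expected result follows that example; cuDNN's support for `Norm1` over `i8` data is an assumption.

```rust
use cudnn::{
    CudnnContext, NanPropagation, ReduceIndices, ReduceOp, ReductionDescriptor, TensorDescriptor,
};
use cust::memory::DeviceBuffer;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let ctx = CudnnContext::new()?;

    // Norm1 reduction: C = alpha * sum(|A|) + gamma * C.
    let desc = ReductionDescriptor::<f32>::new(
        ReduceOp::Norm1,
        NanPropagation::PropagateNaN,
        ReduceIndices::None,
        None,
    )?;

    let a_desc = TensorDescriptor::<i8>::new_strides(&[1, 1, 1, 4], &[4, 4, 4, 1])?;
    let a = DeviceBuffer::<i8>::from_slice(&[-3, 1, -2, 4])?;

    let c_desc = TensorDescriptor::<i8>::new_strides(&[1, 1, 1, 1], &[1, 1, 1, 1])?;
    let mut c = DeviceBuffer::<i8>::from_slice(&[0])?;

    let workspace_size = ctx.get_reduction_workspace_size(&desc, &a_desc, &c_desc)?;
    let mut workspace = unsafe { DeviceBuffer::<u8>::uninitialized(workspace_size)? };

    // No indices are needed for a norm reduction.
    let indices: Option<&mut DeviceBuffer<u8>> = None;
    ctx.reduce(&desc, indices, &mut workspace, 1.0, &a_desc, &a, 1.0, &c_desc, &mut c)?;

    assert_eq!(c.as_host_vec()?[0], 10); // |-3| + |1| + |-2| + |4|

    Ok(())
}
```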
