diff --git a/slangpy/tests/slangpy_tests/test_buffer_views.py b/slangpy/tests/slangpy_tests/test_buffer_views.py index f634afc9..7835821b 100644 --- a/slangpy/tests/slangpy_tests/test_buffer_views.py +++ b/slangpy/tests/slangpy_tests/test_buffer_views.py @@ -108,7 +108,7 @@ def test_to_numpy( strides = Shape(unravelled_shape).calc_contiguous_strides() byte_strides = tuple(s * np_dtype.itemsize for s in strides) - ndarray = buffer.to_numpy() + ndarray = np.ascontiguousarray(buffer.to_numpy()) assert ndarray.shape == unravelled_shape assert ndarray.strides == byte_strides assert ndarray.dtype == np_dtype diff --git a/src/slangpy_ext/utils/slangpystridedbufferview.cpp b/src/slangpy_ext/utils/slangpystridedbufferview.cpp index a833415f..dc34e411 100644 --- a/src/slangpy_ext/utils/slangpystridedbufferview.cpp +++ b/src/slangpy_ext/utils/slangpystridedbufferview.cpp @@ -1,12 +1,15 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include <cstring> #include <memory> +#include <vector> #include "nanobind.h" #include "sgl/device/device.h" #include "sgl/device/command.h" #include "sgl/device/buffer_cursor.h" +#include "sgl/device/reflection.h" #include "utils/slangpybuffer.h" namespace sgl::slangpy { @@ -64,6 +67,21 @@ ref<NativeSlangType> innermost_type(ref<NativeSlangType> type) return result; } +std::vector<ref<NativeSlangType>> type_stack(ref<NativeSlangType> type) +{ + std::vector<ref<NativeSlangType>> res; + ref<NativeSlangType> curr = type; + while (true) { + res.push_back(curr); + ref<NativeSlangType> child = curr->element_type(); + if (!child || child == curr) { + break; + } + curr = child; + } + return res; +} + StridedBufferView::StridedBufferView(Device* device, const StridedBufferViewDesc& desc, ref<Buffer> storage) { if (!storage) { @@ -324,7 +342,6 @@ static nb::ndarray to_ndarray(void* data, nb::handle owner, const Str // Buffer with shape (5, ) of struct Foo { ... 
} -> ndarray of shape (5, sizeof(Foo)) and dtype uint8 bool is_scalar = innermost_layout->type()->kind() == TypeReflection::Kind::scalar; auto dtype_shape = desc.dtype->get_shape(); - auto dtype_strides = dtype_shape.calc_contiguous_strides(); size_t innermost_size = is_scalar ? innermost_layout->stride() : 1; TypeReflection::ScalarType scalar_type @@ -339,9 +356,13 @@ static nb::ndarray to_ndarray(void* data, nb::handle owner, const Str sizes.push_back(desc.shape[i]); strides.push_back(desc.strides[i] * dtype_size / innermost_size); } + // Use cursor reflection to calculate dtype stride. + ref<NativeSlangType> curr_type = desc.dtype; for (size_t i = 0; i < dtype_shape.size(); ++i) { sizes.push_back(dtype_shape[i]); - strides.push_back(dtype_strides[i]); + curr_type = curr_type->element_type(); + auto dtype_stride = curr_type->buffer_type_layout()->stride() / innermost_size; + strides.push_back(dtype_stride); } // If the innermost dtype is not a scalar, add one innermost dimension over // the bytes of the element @@ -388,6 +409,8 @@ nb::ndarray StridedBufferView::to_torch() const void StridedBufferView::copy_from_numpy(nb::ndarray data) { + // StridedBufferView::is_contiguous() == true does not necessarily mean the internal buffer is contiguous in memory + // (the 4-element alignment requirement on Metal will break this contiguity). SGL_CHECK(is_ndarray_contiguous(data), "Source Numpy array must be contiguous"); SGL_CHECK(is_contiguous(), "Destination buffer view must be contiguous"); @@ -397,7 +420,39 @@ void StridedBufferView::copy_from_numpy(nb::ndarray data) size_t buffer_size = m_storage->size() - byte_offset; SGL_CHECK(data_size <= buffer_size, "Numpy array is larger than the buffer ({} > {})", data_size, buffer_size); - m_storage->set_data(data.data(), data_size, byte_offset); + // At this point, the only possible way to break stride in a contiguous buffer is Metal buffer alignment in the + // second-to-last dimension. 
(matrix or vector) + auto kind = desc().dtype->buffer_type_layout()->kind(); + if (kind != TypeReflection::Kind::vector && kind != TypeReflection::Kind::matrix) { + m_storage->set_data(data.data(), data_size, byte_offset); + return; + } + // Walk the dtype's element-type chain to find the innermost scalar and the row/vector type above it. + auto stack = type_stack(desc().dtype); + ref<NativeSlangType> innermost = stack[stack.size() - 1]; + ref<TypeLayoutReflection> innermost_layout = innermost->buffer_type_layout(); + size_t innermost_size = innermost_layout->stride(); + ref<NativeSlangType> second_innermost = stack[stack.size() - 2]; + ref<TypeLayoutReflection> second_innermost_layout = second_innermost->buffer_type_layout(); + size_t second_innermost_size = second_innermost_layout->stride(); + // Row stride matches the packed row size - no padding, so the data can be uploaded directly. + if (second_innermost_size == second_innermost_layout->type()->col_count() * innermost_size) { + m_storage->set_data(data.data(), data_size, byte_offset); + return; + } + + // Stage through a local zero-filled buffer, expanding each packed row to the padded stride. + std::vector<uint8_t> buffer(buffer_size); + auto actual_size = second_innermost_layout->type()->col_count() * innermost_size; + // Copy each packed row into its padded slot. + for (size_t i = 0; i < data_size / actual_size; i++) { + std::memcpy( + buffer.data() + i * second_innermost_size, + static_cast<const uint8_t*>(data.data()) + i * actual_size, + actual_size + ); + } + m_storage->set_data(buffer.data(), buffer.size(), byte_offset); } } // namespace sgl::slangpy