Skip to content

Commit 33e46e2

Browse files
Address issues related to bytes vs. count
1 parent 2886c02 commit 33e46e2

9 files changed

Lines changed: 218 additions & 39 deletions

File tree

examples/cookbook/recipe_device_ipc.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@ int main(int argc, char** argv)
4242

4343
// Allocate device memory - only rank 0 will physically allocate
4444
// All other ranks will import via IPC
45-
const size_t size = 1024 * sizeof(float);
45+
constexpr std::size_t num_elements = 1024;
46+
const size_t size = num_elements * sizeof(float);
4647
float* data = static_cast<float*>(ipc_allocator.allocate(size));
4748

4849
std::cout << "Rank " << rank << ": Got device memory at " << data << std::endl;
@@ -60,7 +61,7 @@ int main(int argc, char** argv)
6061
}
6162

6263
// Copy to device
63-
umpire::copy(host_data, data, size);
64+
umpire::copy(host_data, data, num_elements);
6465
host_allocator.deallocate(host_data);
6566
}
6667

@@ -77,7 +78,7 @@ int main(int argc, char** argv)
7778
// All ranks can now access the data
7879
// Verify by copying a portion back to host
7980
float* value = static_cast<float*>(host_allocator.allocate(sizeof(float)));
80-
umpire::copy(data + 1, value, sizeof(float));
81+
umpire::copy(data + 1, value, 1);
8182

8283
std::cout << "Rank " << rank << ": second value is " << *value << std::endl;
8384

examples/multi_device.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ int main(int, char**)
6868
}
6969
#endif
7070

71-
umpire::copy(a, b, NUM_THREADS * sizeof(double));
71+
umpire::copy(a, b, NUM_THREADS);
7272
b = static_cast<double*>(rm.move(b, rm.getAllocator("HOST")));
7373

7474
UMPIRE_ASSERT(b[BLOCK_SIZE] == (BLOCK_SIZE * MULTIPLE) && "Error: incorrect value!");

examples/tutorial/tut_copy.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ void copy_data(double* source_data, std::size_t size, const std::string& destina
1616
double* dest_data = static_cast<double*>(dest_allocator.allocate(size * sizeof(double)));
1717

1818
// _sphinx_tag_tut_copy_start
19-
umpire::copy(source_data, dest_data, size * sizeof(double));
19+
umpire::copy(source_data, dest_data, size);
2020
// _sphinx_tag_tut_copy_end
2121

2222
std::cout << "Copied source data (" << source_data << ") to destination " << destination << " (" << dest_data << ")"

examples/tutorial/tut_reallocate.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ int main(int, char**)
4141
std::cout << "Reallocating data (" << data << ") to size " << REALLOCATED_SIZE << "...";
4242

4343
// _sphinx_tag_tut_realloc_start
44-
data = static_cast<double*>(umpire::reallocate(&data, REALLOCATED_SIZE * sizeof(double)));
44+
data = umpire::reallocate(&data, REALLOCATED_SIZE);
4545
// _sphinx_tag_tut_realloc_end
4646

4747
std::cout << "done. Reallocated data (" << data << ")" << std::endl;

include/umpire/op/dispatch.hpp

Lines changed: 57 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -289,9 +289,39 @@ struct op_caller {
289289

290290
} // namespace op
291291

292+
//------------------------------------------------------------------------------
293+
// Public API Semantics for Copy and Reallocate Operations
294+
//------------------------------------------------------------------------------
295+
//
296+
// The following functions use different semantics for size parameters based on
297+
// pointer type, matching standard C++ conventions:
298+
//
299+
// COPY OPERATIONS:
300+
// - umpire::copy(T* src, T* dst, std::size_t len) for non-void T:
301+
// len is a COUNT OF ELEMENTS (will be multiplied by sizeof(T) internally)
302+
//
303+
// - umpire::copy(void* src, void* dst, std::size_t len):
304+
// len is BYTES (used directly, no sizeof multiplication)
305+
//
306+
// REALLOCATE OPERATIONS:
307+
// - umpire::reallocate(T** ptr, std::size_t new_size) for non-void T:
308+
// new_size is a COUNT OF ELEMENTS (will be multiplied by sizeof(T) internally)
309+
//
310+
// - umpire::reallocate(void** ptr, std::size_t new_size):
311+
// new_size is BYTES (used directly, no sizeof multiplication)
312+
//
313+
// RATIONALE:
314+
// This matches how detail::get_size<T>(count) works:
315+
// - For void: returns count as-is (bytes)
316+
// - For typed pointers: returns count * sizeof(T) (elements to bytes)
317+
//
318+
// This keeps the API consistent with C++ idioms where typed operations work
319+
// with element counts and void* operations work with byte counts.
320+
//------------------------------------------------------------------------------
321+
292322
// Global operation implementations that use the op_caller
293323
template <typename T>
294-
auto copy(T* src, T* dst, std::size_t len)
324+
void copy(T* src, T* dst, std::size_t len)
295325
{
296326
op::op_caller<op::copy>::exec(src, dst, len);
297327
}
@@ -318,6 +348,10 @@ camp::resources::EventProxy<camp::resources::Resource> memset(T* src, int v, std
318348
template <typename T>
319349
inline T* reallocate(T** src, std::size_t size)
320350
{
351+
// Handle nullptr specially - op_caller can't look up null in allocation map
352+
if (*src == nullptr) {
353+
return op::reallocate<resource::host_platform>::exec(src, size);
354+
}
321355
return op::op_caller<op::reallocate>::exec(src, size);
322356
}
323357

@@ -326,6 +360,10 @@ template <typename T>
326360
inline camp::resources::EventProxy<camp::resources::Resource> reallocate(T** src, std::size_t size,
327361
camp::resources::Resource& ctx)
328362
{
363+
// Handle nullptr specially - op_caller can't look up null in allocation map
364+
if (*src == nullptr) {
365+
return op::reallocate<resource::host_platform>::exec(src, size, ctx);
366+
}
329367
return op::op_caller<op::reallocate>::exec(src, size, ctx);
330368
}
331369

@@ -444,11 +482,12 @@ T* op::reallocate<Src>::exec(T** ptr, std::size_t new_size)
444482
// Allocate new memory
445483
T* new_ptr = static_cast<T*>(allocator.allocate(new_bytes));
446484

447-
// Calculate copy size (minimum of old and new size)
448-
std::size_t copy_size = (old_bytes > new_bytes) ? new_bytes : old_bytes;
485+
// Calculate copy size in bytes (minimum of old and new size)
486+
std::size_t copy_bytes = (old_bytes > new_bytes) ? new_bytes : old_bytes;
449487

450-
// Copy data from old to new location using auto-dispatch copy
451-
umpire::copy(current_ptr, new_ptr, copy_size);
488+
// Copy data using void* to pass bytes directly (avoids sizeof(T) multiplication in copy)
489+
// Note: We cast to void* so that detail::get_size<void>(len) returns len as-is (bytes)
490+
umpire::copy(static_cast<void*>(current_ptr), static_cast<void*>(new_ptr), copy_bytes);
452491

453492
// Deallocate old memory
454493
allocator.deallocate(current_ptr);
@@ -507,11 +546,12 @@ camp::resources::EventProxy<camp::resources::Resource> op::reallocate<Src>::exec
507546
// Allocate new memory
508547
T* new_ptr = static_cast<T*>(allocator.allocate(new_bytes));
509548

510-
// Calculate copy size (minimum of old and new size)
511-
std::size_t copy_size = (old_bytes > new_bytes) ? new_bytes : old_bytes;
549+
// Calculate copy size in bytes (minimum of old and new size)
550+
std::size_t copy_bytes = (old_bytes > new_bytes) ? new_bytes : old_bytes;
512551

513-
// Copy data from old to new location asynchronously using auto-dispatch copy
514-
auto event = umpire::copy(current_ptr, new_ptr, copy_size, ctx);
552+
// Copy data using void* to pass bytes directly (avoids sizeof(T) multiplication in copy)
553+
// Note: We cast to void* so that detail::get_size<void>(len) returns len as-is (bytes)
554+
auto event = umpire::copy(static_cast<void*>(current_ptr), static_cast<void*>(new_ptr), copy_bytes, ctx);
515555

516556
// IMPORTANT: In a fully async implementation, we would need to chain the deallocation
517557
// to happen after the copy completes. However, since we don't have that mechanism yet,
@@ -571,11 +611,11 @@ void* op::reallocate<Src>::exec(void** ptr_ptr, std::size_t new_size)
571611
// Allocate new memory
572612
void* new_ptr = allocator.allocate(new_size);
573613

574-
// Calculate copy size (minimum of old and new size)
575-
std::size_t copy_size = (old_size > new_size) ? new_size : old_size;
614+
// Calculate copy size in bytes (minimum of old and new size)
615+
std::size_t copy_bytes = (old_size > new_size) ? new_size : old_size;
576616

577-
// Copy data from old to new location using auto-dispatch copy
578-
umpire::copy(current_ptr, new_ptr, copy_size);
617+
// Copy data from old to new location (void* naturally uses bytes)
618+
umpire::copy(static_cast<void*>(current_ptr), static_cast<void*>(new_ptr), copy_bytes);
579619

580620
// Deallocate old memory
581621
allocator.deallocate(current_ptr);
@@ -629,11 +669,11 @@ camp::resources::EventProxy<camp::resources::Resource> op::reallocate<Src>::exec
629669
// Allocate new memory
630670
void* new_ptr = allocator.allocate(new_size);
631671

632-
// Calculate copy size (minimum of old and new size)
633-
std::size_t copy_size = (old_size > new_size) ? new_size : old_size;
672+
// Calculate copy size in bytes (minimum of old and new size)
673+
std::size_t copy_bytes = (old_size > new_size) ? new_size : old_size;
634674

635-
// Copy data from old to new location asynchronously using auto-dispatch copy
636-
auto event = umpire::copy(current_ptr, new_ptr, copy_size, ctx);
675+
// Copy data from old to new location asynchronously (void* naturally uses bytes)
676+
auto event = umpire::copy(static_cast<void*>(current_ptr), static_cast<void*>(new_ptr), copy_bytes, ctx);
637677

638678
// Deallocate old memory
639679
allocator.deallocate(current_ptr);

include/umpire/op/host.hpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,5 +73,38 @@ struct memset<resource::host_platform> {
7373
}
7474
};
7575

76+
template <>
77+
struct prefetch<resource::host_platform> {
78+
/**
79+
* @brief Host memory prefetch (no-op)
80+
*
81+
* Prefetch is a no-op for host memory since the CPU has direct access.
82+
* This implementation exists for API compatibility and to avoid throwing errors.
83+
*
84+
* @tparam T Type of data being prefetched
85+
* @param ptr Pointer to memory (unused)
86+
* @param device Device ID (unused)
87+
* @param len Number of bytes to prefetch (unused)
88+
*/
89+
template <typename T>
90+
static void exec(T* /*ptr*/, int /*device*/, std::size_t /*len*/) noexcept
91+
{
92+
// No-op: CPU already has direct access to host memory
93+
}
94+
95+
/**
96+
* @brief Asynchronous host memory prefetch (no-op)
97+
*
98+
* Returns a completed event immediately since this is a no-op.
99+
*/
100+
template <typename T>
101+
static camp::resources::EventProxy<camp::resources::Resource> exec(T* /*ptr*/, int /*device*/, std::size_t /*len*/,
102+
camp::resources::Resource& resource) noexcept
103+
{
104+
// No-op: CPU already has direct access to host memory
105+
return detail::make_completed_event(resource);
106+
}
107+
};
108+
76109
} // namespace op
77110
} // namespace umpire

tests/op/copy_test.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,4 +145,68 @@ TEST(Copy, ZeroSize)
145145
allocator.deallocate(dest_ptr);
146146
}
147147

148+
// Test to validate that typed copy uses element counts, not bytes
149+
TEST(Copy, TypedCopySemantics)
150+
{
151+
constexpr std::size_t num_elements = 100;
152+
153+
auto& rm = umpire::ResourceManager::getInstance();
154+
auto allocator = rm.getAllocator("HOST");
155+
156+
// Allocate buffers for int*
157+
int* source = static_cast<int*>(allocator.allocate(num_elements * sizeof(int)));
158+
int* dest = static_cast<int*>(allocator.allocate(num_elements * sizeof(int)));
159+
160+
// Fill source with known pattern
161+
for (std::size_t i = 0; i < num_elements; ++i) {
162+
source[i] = static_cast<int>(i * 7 + 13);
163+
}
164+
165+
// Zero out destination
166+
std::memset(dest, 0, num_elements * sizeof(int));
167+
168+
// Copy using element count (not bytes)
169+
umpire::copy(source, dest, num_elements);
170+
171+
// Verify exactly num_elements were copied
172+
for (std::size_t i = 0; i < num_elements; ++i) {
173+
ASSERT_EQ(dest[i], static_cast<int>(i * 7 + 13)) << "Element " << i << " not copied correctly";
174+
}
175+
176+
// Cleanup
177+
allocator.deallocate(source);
178+
allocator.deallocate(dest);
179+
}
180+
181+
// Test to validate void* copy uses bytes, not elements
182+
TEST(Copy, VoidVsTypedSemantics)
183+
{
184+
constexpr std::size_t byte_count = 100;
185+
186+
auto& rm = umpire::ResourceManager::getInstance();
187+
auto allocator = rm.getAllocator("HOST");
188+
189+
// Allocate as void*
190+
void* void_src = allocator.allocate(byte_count);
191+
void* void_dst = allocator.allocate(byte_count);
192+
193+
// Fill with byte pattern
194+
std::memset(void_src, 0xAB, byte_count);
195+
std::memset(void_dst, 0xCD, byte_count);
196+
197+
// Copy exact byte count
198+
umpire::copy(void_src, void_dst, byte_count);
199+
200+
// Verify byte-for-byte copy
201+
unsigned char* src_bytes = static_cast<unsigned char*>(void_src);
202+
unsigned char* dst_bytes = static_cast<unsigned char*>(void_dst);
203+
for (std::size_t i = 0; i < byte_count; ++i) {
204+
ASSERT_EQ(dst_bytes[i], src_bytes[i]) << "Byte " << i << " not copied correctly";
205+
}
206+
207+
// Cleanup
208+
allocator.deallocate(void_src);
209+
allocator.deallocate(void_dst);
210+
}
211+
148212
// Additional tests for async copy operations could be added here

tests/op/memset_test.cpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,12 @@ TEST(Memset, TypedMemsetInt)
5656
ptr[i] = static_cast<int>(i);
5757
}
5858

59-
// Use umpire::memset to zero out
60-
umpire::memset(ptr, value, num_elements * sizeof(int));
59+
// Use umpire::memset to zero out (pass element count for typed pointer)
60+
umpire::memset(ptr, value, num_elements);
6161

62-
// Verify each byte is zero
63-
unsigned char* byte_ptr = static_cast<unsigned char*>(static_cast<void*>(ptr));
64-
for (std::size_t i = 0; i < num_elements * sizeof(int); ++i) {
65-
ASSERT_EQ(byte_ptr[i], 0) << "Memset failed at byte " << i;
62+
// Verify each int is zero
63+
for (std::size_t i = 0; i < num_elements; ++i) {
64+
ASSERT_EQ(ptr[i], 0) << "Memset failed at element " << i;
6665
}
6766

6867
// Cleanup

0 commit comments

Comments
 (0)