-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlibsmctrl.h
70 lines (62 loc) · 2.92 KB
/
libsmctrl.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/**
* Copyright 2023 Joshua Bakita
* Library to control TPC masks on CUDA launches. Co-opts preexisting debug
* logic in the CUDA driver library, and thus requires a build with -lcuda.
*/
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
/* PARTITIONING FUNCTIONS */
// Set the global default TPC mask for all kernels, including CUDA-internal
// ones.
// @param mask A bitmask in which each set bit marks the respective TPC as
//             disabled (see Notes on Bitmasks)
// Supported: CUDA 10.2, and CUDA 11.0 - CUDA 12.1
extern void libsmctrl_set_global_mask(uint64_t mask);
// Set the default TPC mask for all kernels launched via `stream`
// (overrides the global mask).
// @param stream A cudaStream_t (aka CUstream_st*) to apply the mask on.
//               NOTE(review): declared as void* here, presumably so that this
//               header does not depend on the CUDA headers - confirm.
// @param mask   A bitmask in which each set bit marks the respective TPC as
//               disabled (see Notes on Bitmasks)
// Supported: CUDA 8.0 - CUDA 12.1
extern void libsmctrl_set_stream_mask(void* stream, uint64_t mask);
// Set the TPC mask for the next kernel launch from the caller's CPU thread.
// Overrides both the global and the per-stream masks, and applies only to
// that single next launch.
// @param mask A bitmask in which each set bit marks the respective TPC as
//             disabled (see Notes on Bitmasks)
// Supported: CUDA 11.0 - CUDA 12.1
extern void libsmctrl_set_next_mask(uint64_t mask);
/**
* Notes on Bitmasks
*
* All of the core partitioning functions take a `uint64_t mask` parameter. A
* set bit in the mask indicates that the respective Thread Processing Cluster
* (TPC) is to be __disabled__.
*
* Examples
* To prohibit the next kernel from using TPC 0:
* libsmctrl_set_next_mask(0x1);
* Allow kernels to only use TPC 0 by default:
* libsmctrl_set_global_mask(~0x1ull);
* Allow kernels in a stream to only use TPCs 2, 3, and 4:
* libsmctrl_set_stream_mask(stream, ~0b00011100ull);
*
* Note that the bitwise inversion operator (~, as used above) is very useful;
* just be sure to apply it to 64-bit integer literals only! Inverting a 32-bit
* unsigned value leaves the upper 32 bits clear (~0x1u != ~0x1ull).
*/
/* INFORMATIONAL FUNCTIONS */
// Get the number of GPCs for device number `dev`, and a GPC-indexed array
// containing masks of which TPCs are associated with each GPC.
// Note that the `nvdebug` module must be loaded to use this function.
// @param num_enabled_gpcs (out) Location to store the number of GPCs at
// @param tpcs_for_gpc     (out) Location to store a pointer to the output
//                         buffer at.
//                         NOTE(review): this header does not state who owns
//                         or frees that buffer - confirm against the
//                         implementation.
// @param dev              (in)  `nvdebug` device ID
// @return 0 on success, `errno`-compatible error code on failure
extern int libsmctrl_get_gpc_info(uint32_t* num_enabled_gpcs, uint64_t** tpcs_for_gpc, int dev);
// Get the total number of TPCs on device number `dev`.
// Note that the `nvdebug` module must be loaded to use this function.
// @param num_tpcs (out) Location to store the number of TPCs at
// @param dev      (in)  `nvdebug` device ID
// @return 0 on success, `errno`-compatible error code on failure
extern int libsmctrl_get_tpc_info(uint32_t* num_tpcs, int dev);
// Identical to libsmctrl_get_tpc_info(), but takes a CUDA device ID rather
// than an `nvdebug` device ID. Does not require `nvdebug`.
// @param num_tpcs (out) Location to store the number of TPCs at
// @param cuda_dev (in)  CUDA device ID
// @return 0 on success, `errno`-compatible error code on failure
extern int libsmctrl_get_tpc_info_cuda(uint32_t* num_tpcs, int cuda_dev);
#ifdef __cplusplus
}
#endif