Skip to content

Commit 7e6a088

Browse files
jan-wassenberg authored and
copybara-github committed
Windows workaround: docs fail to mention GroupCount requirements. Fixes #2734
PiperOrigin-RevId: 813164942
1 parent 0913de4 commit 7e6a088

File tree

1 file changed

+27
-4
lines changed

1 file changed

+27
-4
lines changed

hwy/contrib/thread_pool/topology.cc

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,9 @@ bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) {
114114
}
115115
HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER);
116116
// Note: `buf_bytes` may be less than `sizeof(SLPI)`, which has padding.
117-
uint8_t* buf = static_cast<uint8_t*>(malloc(buf_bytes));
117+
// `calloc` zero-initializes the `Reserved` field, part of which has been
118+
// repurposed into `GroupCount` in SDKs, 10.0.22000.0 or possibly earlier.
119+
uint8_t* buf = static_cast<uint8_t*>(calloc(1, buf_bytes));
118120
HWY_ASSERT(buf);
119121

120122
// Fill the buffer.
@@ -658,6 +660,27 @@ void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
658660

659661
#elif HWY_OS_WIN
660662

663+
// See #2734. GroupCount was added around Windows 10, but SDK docs do not
664+
// mention the actual version required. It is known to be absent in 8.1 and
665+
// MinGW 5.0.1, and present in the 10.0.22000.0 SDK. However, the OS must also
666+
// know about the field. Thus we zero-initialize the reserved field, assume it
667+
// remains zero, and return 1 if zero (old style single GroupMask), otherwise
668+
// the number of groups. There are two such structures, but note that
669+
// `PROCESSOR_RELATIONSHIP` already had this field.
670+
static size_t GroupCount(const CACHE_RELATIONSHIP& cr) {
671+
// Added as the last u16 in the reserved area before GroupMask. We only read
672+
// one byte because 256*64 processor bits are plenty.
673+
const uint8_t* pcount =
674+
reinterpret_cast<const uint8_t*>(&cr.GroupMask) - sizeof(uint16_t);
675+
return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1);
676+
}
677+
678+
static size_t GroupCount(const NUMA_NODE_RELATIONSHIP& nn) {
679+
const uint8_t* pcount =
680+
reinterpret_cast<const uint8_t*>(&nn.GroupMask) - sizeof(uint16_t);
681+
return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1);
682+
}
683+
661684
// Also sets LP.core and LP.smt.
662685
size_t MaxLpsPerCore(std::vector<Topology::LP>& lps) {
663686
size_t max_lps_per_core = 0;
@@ -711,7 +734,7 @@ size_t MaxCoresPerCluster(const size_t max_lps_per_core,
711734
const CACHE_RELATIONSHIP& cr = info.Cache;
712735
if (cr.Type != CacheUnified && cr.Type != CacheData) return;
713736
if (cr.Level != 3) return;
714-
foreach_cluster(cr.GroupCount, cr.GroupMasks);
737+
foreach_cluster(GroupCount(cr), cr.GroupMasks);
715738
};
716739

717740
if (!ForEachSLPI(RelationProcessorDie, foreach_die)) {
@@ -768,7 +791,7 @@ void SetNodes(std::vector<Topology::LP>& lps) {
768791
if (info.Relationship != RelationNumaNode) return;
769792
const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode;
770793
// This field was previously reserved/zero. There is at least one group.
771-
const size_t num_groups = HWY_MAX(1, nn.GroupCount);
794+
const size_t num_groups = HWY_MAX(1, GroupCount(nn));
772795
const uint8_t node = static_cast<uint8_t>(nn.NodeNumber);
773796
ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__,
774797
[node](size_t lp, std::vector<Topology::LP>& lps) {
@@ -1027,7 +1050,7 @@ bool InitCachesWin(Caches& caches) {
10271050
: cr.Associativity;
10281051

10291052
// How many cores share this cache?
1030-
size_t shared_with = NumBits(cr.GroupCount, cr.GroupMasks);
1053+
size_t shared_with = NumBits(GroupCount(cr), cr.GroupMasks);
10311054
// Divide out hyperthreads. This core may have fewer than
10321055
// `max_lps_per_core`, hence round up.
10331056
shared_with = DivCeil(shared_with, max_lps_per_core);

0 commit comments

Comments
 (0)