@@ -114,7 +114,9 @@ bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) {
   }
   HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER);
   // Note: `buf_bytes` may be less than `sizeof(SLPI)`, which has padding.
-  uint8_t* buf = static_cast<uint8_t*>(malloc(buf_bytes));
+  // `calloc` zero-initializes the `Reserved` field, part of which has been
+  // repurposed into `GroupCount` in SDK 10.0.22000.0 or possibly earlier.
+  uint8_t* buf = static_cast<uint8_t*>(calloc(1, buf_bytes));
   HWY_ASSERT(buf);
 
   // Fill the buffer.
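Aside (not part of the commit): a minimal standalone sketch of why `calloc` matters here. The struct below is a hypothetical stand-in, not the real `SLPI`/`CACHE_RELATIONSHIP` layout; the point is that zero-initialized reserved bytes make a later-repurposed count read as 0 on an OS that never writes it, which callers can then map to "one group".

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Hypothetical stand-in for a struct whose trailing reserved bytes were
// later repurposed into a u16 count; not the real Windows layout.
struct Relationship {
  uint32_t Level;
  uint8_t Reserved[18];  // newer SDKs carve a u16 count out of the end
  uint64_t GroupMask;
};

int main() {
  // calloc guarantees every byte, including Reserved, starts as zero. An old
  // OS that does not know the repurposed field leaves it 0; malloc would
  // leave indeterminate bytes that could be misread as a large group count.
  Relationship* r =
      static_cast<Relationship*>(calloc(1, sizeof(Relationship)));
  if (r == nullptr) return 1;
  uint16_t count;
  memcpy(&count, r->Reserved + sizeof(r->Reserved) - sizeof(count),
         sizeof(count));
  printf("count=%u -> %u group(s)\n", count, count == 0 ? 1u : count);
  free(r);
  return 0;
}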
@@ -658,6 +660,27 @@ void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
 
 #elif HWY_OS_WIN
 
+// See #2734. GroupCount was added around Windows 10, but SDK docs do not
+// mention the actual version required. It is known to be absent in 8.1 and
+// MinGW 5.0.1, and present in the 10.0.22000.0 SDK. However, the OS must also
+// know about the field. Thus we zero-initialize the reserved field, assume it
+// remains zero, and return 1 if zero (old-style single GroupMask), otherwise
+// the number of groups. There are two such structures; note that
+// `PROCESSOR_RELATIONSHIP` already had this field.
+static size_t GroupCount(const CACHE_RELATIONSHIP& cr) {
+  // Added as the last u16 in the reserved area before GroupMask. We only read
+  // one byte because 256*64 processor bits are plenty.
+  const uint8_t* pcount =
+      reinterpret_cast<const uint8_t*>(&cr.GroupMask) - sizeof(uint16_t);
+  return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1);
+}
+
+static size_t GroupCount(const NUMA_NODE_RELATIONSHIP& nn) {
+  const uint8_t* pcount =
+      reinterpret_cast<const uint8_t*>(&nn.GroupMask) - sizeof(uint16_t);
+  return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1);
+}
+
 // Also sets LP.core and LP.smt.
 size_t MaxLpsPerCore(std::vector<Topology::LP>& lps) {
   size_t max_lps_per_core = 0;
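Aside: the `pcount[HWY_IS_BIG_ENDIAN]` indexing deserves a standalone illustration. Reading only the low byte of a native-endian u16 works because that byte sits at offset 0 on little-endian and offset 1 on big-endian machines, and counts above 255 cannot occur (256 groups * 64 bits per mask is already 16384 LPs). The sketch below re-derives the endianness at runtime instead of using Highway's compile-time `HWY_IS_BIG_ENDIAN` macro, purely for self-containedness.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Runtime substitute for the compile-time HWY_IS_BIG_ENDIAN macro (0 or 1).
static int IsBigEndian() {
  const uint16_t probe = 1;
  uint8_t first_byte;
  memcpy(&first_byte, &probe, 1);
  return first_byte == 0;  // low byte stored last => big-endian
}

int main() {
  const uint16_t group_count = 3;  // as the OS stores it, native-endian
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&group_count);
  // The low byte is bytes[0] on little-endian and bytes[1] on big-endian,
  // so indexing with the endianness flag always selects the low byte.
  printf("low byte = %u\n", bytes[IsBigEndian()]);
  return 0;
}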
@@ -711,7 +734,7 @@ size_t MaxCoresPerCluster(const size_t max_lps_per_core,
     const CACHE_RELATIONSHIP& cr = info.Cache;
     if (cr.Type != CacheUnified && cr.Type != CacheData) return;
     if (cr.Level != 3) return;
-    foreach_cluster(cr.GroupCount, cr.GroupMasks);
+    foreach_cluster(GroupCount(cr), cr.GroupMasks);
   };
 
   if (!ForEachSLPI(RelationProcessorDie, foreach_die)) {
@@ -768,7 +791,7 @@ void SetNodes(std::vector<Topology::LP>& lps) {
     if (info.Relationship != RelationNumaNode) return;
     const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode;
     // This field was previously reserved/zero. There is at least one group.
-    const size_t num_groups = HWY_MAX(1, nn.GroupCount);
+    const size_t num_groups = HWY_MAX(1, GroupCount(nn));
     const uint8_t node = static_cast<uint8_t>(nn.NodeNumber);
     ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__,
                [node](size_t lp, std::vector<Topology::LP>& lps) {
@@ -1027,7 +1050,7 @@ bool InitCachesWin(Caches& caches) {
                                        : cr.Associativity;
 
     // How many cores share this cache?
-    size_t shared_with = NumBits(cr.GroupCount, cr.GroupMasks);
+    size_t shared_with = NumBits(GroupCount(cr), cr.GroupMasks);
     // Divide out hyperthreads. This core may have fewer than
     // `max_lps_per_core`, hence round up.
     shared_with = DivCeil(shared_with, max_lps_per_core);
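Aside: `NumBits` and `DivCeil` are existing helpers in this file; assuming they behave as their names suggest, the arithmetic looks like the sketch below (illustrative names, one plain u64 mask per group instead of the real `GROUP_AFFINITY`).

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Count set bits across per-group masks; each bit is one logical processor.
static size_t CountLps(size_t num_groups, const uint64_t* masks) {
  size_t total = 0;
  for (size_t g = 0; g < num_groups; ++g) {
    uint64_t m = masks[g];
    while (m) {
      m &= m - 1;  // Kernighan's trick: clear the lowest set bit
      ++total;
    }
  }
  return total;
}

// Round up, so a core with fewer than max_lps_per_core LPs still counts once.
static size_t DivideCeil(size_t a, size_t b) { return (a + b - 1) / b; }

int main() {
  const uint64_t masks[2] = {0xFFu, 0xFu};  // 8 + 4 = 12 LPs share this cache
  const size_t max_lps_per_core = 2;        // e.g. 2-way SMT
  const size_t shared_with =
      DivideCeil(CountLps(2, masks), max_lps_per_core);  // -> 6 cores
  printf("cores sharing cache: %zu\n", shared_with);
  return 0;
}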