diff --git a/compiler/rustc_data_structures/src/profiling.rs b/compiler/rustc_data_structures/src/profiling.rs
index e3a01e4035c49..2903155421c0d 100644
--- a/compiler/rustc_data_structures/src/profiling.rs
+++ b/compiler/rustc_data_structures/src/profiling.rs
@@ -88,6 +88,7 @@ use std::fmt::Display;
 use std::intrinsics::unlikely;
 use std::path::Path;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
 use std::time::{Duration, Instant};
 use std::{fs, process};
 
@@ -105,6 +106,8 @@ bitflags::bitflags! {
     struct EventFilter: u16 {
         const GENERIC_ACTIVITIES = 1 << 0;
         const QUERY_PROVIDERS = 1 << 1;
+        /// Store detailed instant events, including timestamp and thread ID,
+        /// for each query cache hit. Note that this is quite expensive.
         const QUERY_CACHE_HITS = 1 << 2;
         const QUERY_BLOCKED = 1 << 3;
         const INCR_CACHE_LOADS = 1 << 4;
@@ -113,16 +116,20 @@ bitflags::bitflags! {
         const FUNCTION_ARGS = 1 << 6;
         const LLVM = 1 << 7;
         const INCR_RESULT_HASHING = 1 << 8;
-        const ARTIFACT_SIZES = 1 << 9;
+        const ARTIFACT_SIZES = 1 << 9;
+        /// Store aggregated counts of cache hits per query invocation.
+        const QUERY_CACHE_HIT_COUNTS = 1 << 10;
 
         const DEFAULT = Self::GENERIC_ACTIVITIES.bits() |
                         Self::QUERY_PROVIDERS.bits() |
                         Self::QUERY_BLOCKED.bits() |
                         Self::INCR_CACHE_LOADS.bits() |
                         Self::INCR_RESULT_HASHING.bits() |
-                        Self::ARTIFACT_SIZES.bits();
+                        Self::ARTIFACT_SIZES.bits() |
+                        Self::QUERY_CACHE_HIT_COUNTS.bits();
 
         const ARGS = Self::QUERY_KEYS.bits() | Self::FUNCTION_ARGS.bits();
+        const QUERY_CACHE_HIT_COMBINED = Self::QUERY_CACHE_HITS.bits() | Self::QUERY_CACHE_HIT_COUNTS.bits();
     }
 }
 
@@ -134,6 +141,7 @@ const EVENT_FILTERS_BY_NAME: &[(&str, EventFilter)] = &[
     ("generic-activity", EventFilter::GENERIC_ACTIVITIES),
     ("query-provider", EventFilter::QUERY_PROVIDERS),
     ("query-cache-hit", EventFilter::QUERY_CACHE_HITS),
+    ("query-cache-hit-count", EventFilter::QUERY_CACHE_HIT_COUNTS),
     ("query-blocked", EventFilter::QUERY_BLOCKED),
     ("incr-cache-load", EventFilter::INCR_CACHE_LOADS),
     ("query-keys", EventFilter::QUERY_KEYS),
@@ -411,13 +419,24 @@ impl SelfProfilerRef {
         #[inline(never)]
         #[cold]
         fn cold_call(profiler_ref: &SelfProfilerRef, query_invocation_id: QueryInvocationId) {
-            profiler_ref.instant_query_event(
-                |profiler| profiler.query_cache_hit_event_kind,
-                query_invocation_id,
-            );
+            if profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+                profiler_ref
+                    .profiler
+                    .as_ref()
+                    .unwrap()
+                    .increment_query_cache_hit_counters(QueryInvocationId(query_invocation_id.0));
+            }
+            if unlikely(profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+                profiler_ref.instant_query_event(
+                    |profiler| profiler.query_cache_hit_event_kind,
+                    query_invocation_id,
+                );
+            }
         }
 
-        if unlikely(self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+        // We check both kinds of query cache hit events at once, to reduce overhead in the
+        // common case (with self-profile disabled).
+        if unlikely(self.event_filter_mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED)) {
             cold_call(self, query_invocation_id);
         }
     }
@@ -489,6 +508,35 @@ impl SelfProfilerRef {
         self.profiler.as_ref().map(|p| p.get_or_alloc_cached_string(s))
     }
 
+    /// Store query cache hits in the self-profile log.
+    /// Should be called once at the end of the compilation session.
+    ///
+    /// The cache hits are stored per **query invocation**, not **per query kind/type**.
+    /// `analyzeme` can later deduplicate individual query labels from the QueryInvocationId event
+    /// IDs.
+    pub fn store_query_cache_hits(&self) {
+        if self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+            let profiler = self.profiler.as_ref().unwrap();
+            let query_hits = profiler.query_hits.read();
+            let builder = EventIdBuilder::new(&profiler.profiler);
+            let thread_id = get_thread_id();
+            for (query_invocation, hit_count) in query_hits.iter().enumerate() {
+                let hit_count = hit_count.load(Ordering::Relaxed);
+                // No need to record empty cache hit counts.
+                if hit_count > 0 {
+                    let event_id =
+                        builder.from_label(StringId::new_virtual(query_invocation as u64));
+                    profiler.profiler.record_integer_event(
+                        profiler.query_cache_hit_count_event_kind,
+                        event_id,
+                        thread_id,
+                        hit_count,
+                    );
+                }
+            }
+        }
+    }
+
     #[inline]
     pub fn enabled(&self) -> bool {
         self.profiler.is_some()
@@ -537,6 +585,19 @@ pub struct SelfProfiler {
     string_cache: RwLock<FxHashMap<String, StringId>>,
 
+    /// Recording individual query cache hits as "instant" measureme events
+    /// is incredibly expensive. Instead of doing that, we simply aggregate
+    /// cache hit *counts* per query invocation, and then store the final count
+    /// of cache hits per invocation at the end of the compilation session.
+    ///
+    /// With this approach, we don't know the individual thread IDs and timestamps
+    /// of cache hits, but it has very little overhead on top of `-Zself-profile`.
+    /// Recording the cache hits as individual events made compilation 3-5x slower.
+    ///
+    /// Query invocation IDs should be monotonic integers, so we can store them in a vec,
+    /// rather than using a hashmap.
+    query_hits: RwLock<Vec<AtomicU64>>,
+
     query_event_kind: StringId,
     generic_activity_event_kind: StringId,
     incremental_load_result_event_kind: StringId,
@@ -544,6 +605,8 @@ pub struct SelfProfiler {
     query_blocked_event_kind: StringId,
     query_cache_hit_event_kind: StringId,
     artifact_size_event_kind: StringId,
+    /// Total cache hits per query invocation.
+    query_cache_hit_count_event_kind: StringId,
 }
 
 impl SelfProfiler {
@@ -573,6 +636,7 @@ impl SelfProfiler {
         let query_blocked_event_kind = profiler.alloc_string("QueryBlocked");
         let query_cache_hit_event_kind = profiler.alloc_string("QueryCacheHit");
         let artifact_size_event_kind = profiler.alloc_string("ArtifactSize");
+        let query_cache_hit_count_event_kind = profiler.alloc_string("QueryCacheHitCount");
 
         let mut event_filter_mask = EventFilter::empty();
 
@@ -618,6 +682,8 @@ impl SelfProfiler {
             query_blocked_event_kind,
             query_cache_hit_event_kind,
             artifact_size_event_kind,
+            query_cache_hit_count_event_kind,
+            query_hits: Default::default(),
         })
     }
 
@@ -627,6 +693,25 @@ impl SelfProfiler {
         self.profiler.alloc_string(s)
     }
 
+    /// Store a cache hit of a query invocation.
+    pub fn increment_query_cache_hit_counters(&self, id: QueryInvocationId) {
+        // Fast path: assume that the query was already encountered before, and just record
+        // a cache hit.
+        let mut guard = self.query_hits.upgradable_read();
+        let query_hits = &guard;
+        let index = id.0 as usize;
+        if index < query_hits.len() {
+            // We only want to increment the count, no other synchronization is required.
+            query_hits[index].fetch_add(1, Ordering::Relaxed);
+        } else {
+            // If not, we need to extend the query hit map to the highest observed ID.
+            guard.with_upgraded(|vec| {
+                vec.resize_with(index + 1, || AtomicU64::new(0));
+                vec[index] = AtomicU64::from(1);
+            });
+        }
+    }
+
     /// Gets a `StringId` for the given string. This method makes sure that
     /// any strings going through it will only be allocated once in the
     /// profiling data.
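As a review aid, not part of the patch: the locking scheme in `increment_query_cache_hit_counters` can be tried out in isolation. Below is a minimal, self-contained sketch of the same fast-path/slow-path pattern, written directly against `parking_lot` (whose lock API matches the `upgradable_read`/upgrade calls in the diff). The `HitCounters` type and its method names are hypothetical, chosen only for illustration.

```rust
use std::sync::atomic::{AtomicU64, Ordering};

use parking_lot::{RwLock, RwLockUpgradableReadGuard};

/// Grow-only table of per-key counters (hypothetical illustration type).
/// Readers bump counters while holding a shared (upgradable) lock; the
/// exclusive lock is only taken when the table must grow to fit a new key.
struct HitCounters {
    counts: RwLock<Vec<AtomicU64>>,
}

impl HitCounters {
    fn new() -> Self {
        Self { counts: RwLock::new(Vec::new()) }
    }

    fn increment(&self, index: usize) {
        let guard = self.counts.upgradable_read();
        if index < guard.len() {
            // Fast path: the slot already exists, so a relaxed atomic
            // increment under the shared lock is all we need.
            guard[index].fetch_add(1, Ordering::Relaxed);
        } else {
            // Slow path: upgrade to a write lock and grow the table.
            // Only one upgradable guard can exist at a time, so the length
            // check above cannot be invalidated by a concurrent grower.
            let mut vec = RwLockUpgradableReadGuard::upgrade(guard);
            vec.resize_with(index + 1, || AtomicU64::new(0));
            vec[index] = AtomicU64::new(1);
        }
    }
}

fn main() {
    let counters = HitCounters::new();
    counters.increment(3);
    counters.increment(3);
    counters.increment(0);
    let counts = counters.counts.read();
    assert_eq!(counts[3].load(Ordering::Relaxed), 2);
    assert_eq!(counts[0].load(Ordering::Relaxed), 1);
}
```

The design rationale mirrors the doc comment on `query_hits`: cache hits vastly outnumber table growths, so the common case costs one shared-lock acquisition plus one relaxed atomic add, and the write lock is touched only when a new query invocation ID exceeds the table length.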
diff --git a/compiler/rustc_query_impl/src/profiling_support.rs b/compiler/rustc_query_impl/src/profiling_support.rs
index 7d80e54bf87e6..7be75ea88acad 100644
--- a/compiler/rustc_query_impl/src/profiling_support.rs
+++ b/compiler/rustc_query_impl/src/profiling_support.rs
@@ -259,4 +259,5 @@ pub fn alloc_self_profile_query_strings(tcx: TyCtxt<'_>) {
     for alloc in super::ALLOC_SELF_PROFILE_QUERY_STRINGS.iter() {
         alloc(tcx, &mut string_cache)
    }
+    tcx.sess.prof.store_query_cache_hits();
 }
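The hook is placed after the query strings have been allocated, presumably so that the virtual `StringId`s emitted by `store_query_cache_hits` resolve to real query labels when `analyzeme` post-processes the trace. Continuing the hypothetical `HitCounters` sketch above, the end-of-session flush reduces to a single pass that skips slots that were never hit, analogous to the `hit_count > 0` check in `store_query_cache_hits`:

```rust
// Continues the `HitCounters` sketch above (same imports; illustration only).
impl HitCounters {
    /// One-shot, end-of-session flush: walk the table once and report
    /// only the slots that recorded at least one hit.
    fn flush(&self, mut report: impl FnMut(usize, u64)) {
        for (index, count) in self.counts.read().iter().enumerate() {
            let count = count.load(Ordering::Relaxed);
            // Invocations that never hit the cache produce no event.
            if count > 0 {
                report(index, count);
            }
        }
    }
}

// Usage, e.g. at shutdown:
// counters.flush(|invocation, hits| println!("invocation {invocation}: {hits} hits"));
```

Note that because `QUERY_CACHE_HIT_COUNTS` is included in `EventFilter::DEFAULT`, the aggregated counts are recorded by plain `-Zself-profile`, while the expensive per-hit instant events (`QUERY_CACHE_HITS`) still require opting in through `-Zself-profile-events`.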