diff --git a/compiler/rustc_data_structures/src/profiling.rs b/compiler/rustc_data_structures/src/profiling.rs
index e3a01e4035c49..2903155421c0d 100644
--- a/compiler/rustc_data_structures/src/profiling.rs
+++ b/compiler/rustc_data_structures/src/profiling.rs
@@ -88,6 +88,7 @@ use std::fmt::Display;
 use std::intrinsics::unlikely;
 use std::path::Path;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
 use std::time::{Duration, Instant};
 use std::{fs, process};
 
@@ -105,6 +106,8 @@ bitflags::bitflags! {
     struct EventFilter: u16 {
         const GENERIC_ACTIVITIES = 1 << 0;
         const QUERY_PROVIDERS = 1 << 1;
+        /// Store detailed instant events, including timestamp and thread ID,
+        /// for each query cache hit. Note that this is quite expensive.
         const QUERY_CACHE_HITS = 1 << 2;
         const QUERY_BLOCKED = 1 << 3;
         const INCR_CACHE_LOADS = 1 << 4;
@@ -113,16 +116,20 @@ bitflags::bitflags! {
         const FUNCTION_ARGS = 1 << 6;
         const LLVM = 1 << 7;
         const INCR_RESULT_HASHING = 1 << 8;
-        const ARTIFACT_SIZES = 1 << 9;
+        const ARTIFACT_SIZES = 1 << 9;
+        /// Store aggregated counts of cache hits per query invocation.
+        const QUERY_CACHE_HIT_COUNTS = 1 << 10;
 
         const DEFAULT = Self::GENERIC_ACTIVITIES.bits() |
                         Self::QUERY_PROVIDERS.bits() |
                         Self::QUERY_BLOCKED.bits() |
                         Self::INCR_CACHE_LOADS.bits() |
                         Self::INCR_RESULT_HASHING.bits() |
-                        Self::ARTIFACT_SIZES.bits();
+                        Self::ARTIFACT_SIZES.bits() |
+                        Self::QUERY_CACHE_HIT_COUNTS.bits();
 
         const ARGS = Self::QUERY_KEYS.bits() | Self::FUNCTION_ARGS.bits();
+        const QUERY_CACHE_HIT_COMBINED = Self::QUERY_CACHE_HITS.bits() | Self::QUERY_CACHE_HIT_COUNTS.bits();
     }
 }
 
@@ -134,6 +141,7 @@ const EVENT_FILTERS_BY_NAME: &[(&str, EventFilter)] = &[
     ("generic-activity", EventFilter::GENERIC_ACTIVITIES),
     ("query-provider", EventFilter::QUERY_PROVIDERS),
     ("query-cache-hit", EventFilter::QUERY_CACHE_HITS),
+    ("query-cache-hit-count", EventFilter::QUERY_CACHE_HIT_COUNTS),
     ("query-blocked", EventFilter::QUERY_BLOCKED),
     ("incr-cache-load", EventFilter::INCR_CACHE_LOADS),
     ("query-keys", EventFilter::QUERY_KEYS),
@@ -411,13 +419,24 @@ impl SelfProfilerRef {
         #[inline(never)]
         #[cold]
         fn cold_call(profiler_ref: &SelfProfilerRef, query_invocation_id: QueryInvocationId) {
-            profiler_ref.instant_query_event(
-                |profiler| profiler.query_cache_hit_event_kind,
-                query_invocation_id,
-            );
+            if profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+                profiler_ref
+                    .profiler
+                    .as_ref()
+                    .unwrap()
+                    .increment_query_cache_hit_counters(QueryInvocationId(query_invocation_id.0));
+            }
+            if unlikely(profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+                profiler_ref.instant_query_event(
+                    |profiler| profiler.query_cache_hit_event_kind,
+                    query_invocation_id,
+                );
+            }
         }
 
-        if unlikely(self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+        // We check both kinds of query cache hit events at once, to reduce overhead in the
+        // common case (with self-profile disabled).
+        if unlikely(self.event_filter_mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED)) {
             cold_call(self, query_invocation_id);
         }
     }
@@ -489,6 +508,35 @@ impl SelfProfilerRef {
         self.profiler.as_ref().map(|p| p.get_or_alloc_cached_string(s))
     }
 
+    /// Store query cache hits in the self-profile log.
+    /// Should be called once at the end of the compilation session.
+    ///
+    /// The cache hits are stored per **query invocation**, not **per query kind/type**.
+    /// `analyzeme` can later deduplicate individual query labels from the QueryInvocationId event
+    /// IDs.
+    pub fn store_query_cache_hits(&self) {
+        if self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+            let profiler = self.profiler.as_ref().unwrap();
+            let query_hits = profiler.query_hits.read();
+            let builder = EventIdBuilder::new(&profiler.profiler);
+            let thread_id = get_thread_id();
+            for (query_invocation, hit_count) in query_hits.iter().enumerate() {
+                let hit_count = hit_count.load(Ordering::Relaxed);
+                // No need to record empty cache hit counts.
+                if hit_count > 0 {
+                    let event_id =
+                        builder.from_label(StringId::new_virtual(query_invocation as u64));
+                    profiler.profiler.record_integer_event(
+                        profiler.query_cache_hit_count_event_kind,
+                        event_id,
+                        thread_id,
+                        hit_count,
+                    );
+                }
+            }
+        }
+    }
+
     #[inline]
     pub fn enabled(&self) -> bool {
         self.profiler.is_some()
@@ -537,6 +585,19 @@ pub struct SelfProfiler {
     string_cache: RwLock<FxHashMap<String, StringId>>,
 
+    /// Recording individual query cache hits as "instant" measureme events
+    /// is incredibly expensive. Instead of doing that, we simply aggregate
+    /// cache hit *counts* per query invocation, and then store the final count
+    /// of cache hits per invocation at the end of the compilation session.
+    ///
+    /// With this approach, we don't know the individual thread IDs and timestamps
+    /// of cache hits, but it has very little overhead on top of `-Zself-profile`.
+    /// Recording the cache hits as individual events made compilation 3-5x slower.
+    ///
+    /// Query invocation IDs should be monotonic integers, so we can store them in a vec,
+    /// rather than using a hashmap.
+    query_hits: RwLock<Vec<AtomicU64>>,
+
     query_event_kind: StringId,
     generic_activity_event_kind: StringId,
     incremental_load_result_event_kind: StringId,
@@ -544,6 +605,8 @@ pub struct SelfProfiler {
     query_blocked_event_kind: StringId,
     query_cache_hit_event_kind: StringId,
     artifact_size_event_kind: StringId,
+    /// Total cache hits per query invocation.
+    query_cache_hit_count_event_kind: StringId,
 }
 
 impl SelfProfiler {
@@ -573,6 +636,7 @@ impl SelfProfiler {
         let query_blocked_event_kind = profiler.alloc_string("QueryBlocked");
         let query_cache_hit_event_kind = profiler.alloc_string("QueryCacheHit");
         let artifact_size_event_kind = profiler.alloc_string("ArtifactSize");
+        let query_cache_hit_count_event_kind = profiler.alloc_string("QueryCacheHitCount");
 
         let mut event_filter_mask = EventFilter::empty();
 
@@ -618,6 +682,8 @@ impl SelfProfiler {
             query_blocked_event_kind,
             query_cache_hit_event_kind,
             artifact_size_event_kind,
+            query_cache_hit_count_event_kind,
+            query_hits: Default::default(),
         })
     }
 
@@ -627,6 +693,25 @@ impl SelfProfiler {
         self.profiler.alloc_string(s)
     }
 
+    /// Store a cache hit of a query invocation.
+    pub fn increment_query_cache_hit_counters(&self, id: QueryInvocationId) {
+        // Fast path: assume that the query was already encountered before, and just record
+        // a cache hit.
+        let mut guard = self.query_hits.upgradable_read();
+        let query_hits = &guard;
+        let index = id.0 as usize;
+        if index < query_hits.len() {
+            // We only want to increment the count, no other synchronization is required.
+            query_hits[index].fetch_add(1, Ordering::Relaxed);
+        } else {
+            // If not, we need to extend the query hit map to the highest observed ID.
+            guard.with_upgraded(|vec| {
+                vec.resize_with(index + 1, || AtomicU64::new(0));
+                vec[index] = AtomicU64::from(1);
+            });
+        }
+    }
+
     /// Gets a `StringId` for the given string. This method makes sure that
     /// any strings going through it will only be allocated once in the
     /// profiling data.
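As a review aid, not part of the patch: the locking scheme in `increment_query_cache_hit_counters` can be tried out in isolation. Below is a minimal, self-contained sketch of the same fast-path/slow-path pattern, written directly against `parking_lot` (whose lock API matches the `upgradable_read`/upgrade calls in the diff). The `HitCounters` type and its method names are hypothetical, chosen only for illustration.

```rust
use std::sync::atomic::{AtomicU64, Ordering};

use parking_lot::{RwLock, RwLockUpgradableReadGuard};

/// Grow-only table of per-key counters (hypothetical illustration type).
/// Readers bump counters while holding a shared (upgradable) lock; the
/// exclusive lock is only taken when the table must grow to fit a new key.
struct HitCounters {
    counts: RwLock<Vec<AtomicU64>>,
}

impl HitCounters {
    fn new() -> Self {
        Self { counts: RwLock::new(Vec::new()) }
    }

    fn increment(&self, index: usize) {
        let guard = self.counts.upgradable_read();
        if index < guard.len() {
            // Fast path: the slot already exists, so a relaxed atomic
            // increment under the shared lock is all we need.
            guard[index].fetch_add(1, Ordering::Relaxed);
        } else {
            // Slow path: upgrade to a write lock and grow the table.
            // Only one upgradable guard can exist at a time, so the length
            // check above cannot be invalidated by a concurrent grower.
            let mut vec = RwLockUpgradableReadGuard::upgrade(guard);
            vec.resize_with(index + 1, || AtomicU64::new(0));
            vec[index] = AtomicU64::new(1);
        }
    }
}

fn main() {
    let counters = HitCounters::new();
    counters.increment(3);
    counters.increment(3);
    counters.increment(0);
    let counts = counters.counts.read();
    assert_eq!(counts[3].load(Ordering::Relaxed), 2);
    assert_eq!(counts[0].load(Ordering::Relaxed), 1);
}
```

The design rationale mirrors the doc comment on `query_hits`: cache hits vastly outnumber table growths, so the common case costs one shared-lock acquisition plus one relaxed atomic add, and the write lock is touched only when a new query invocation ID exceeds the table length.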
diff --git a/compiler/rustc_query_impl/src/profiling_support.rs b/compiler/rustc_query_impl/src/profiling_support.rs
index 7d80e54bf87e6..7be75ea88acad 100644
--- a/compiler/rustc_query_impl/src/profiling_support.rs
+++ b/compiler/rustc_query_impl/src/profiling_support.rs
@@ -259,4 +259,5 @@ pub fn alloc_self_profile_query_strings(tcx: TyCtxt<'_>) {
     for alloc in super::ALLOC_SELF_PROFILE_QUERY_STRINGS.iter() {
         alloc(tcx, &mut string_cache)
    }
+    tcx.sess.prof.store_query_cache_hits();
 }
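The hook is placed after the query strings have been allocated, presumably so that the virtual `StringId`s emitted by `store_query_cache_hits` resolve to real query labels when `analyzeme` post-processes the trace. Continuing the hypothetical `HitCounters` sketch above, the end-of-session flush reduces to a single pass that skips slots that were never hit, analogous to the `hit_count > 0` check in `store_query_cache_hits`:

```rust
// Continues the `HitCounters` sketch above (same imports; illustration only).
impl HitCounters {
    /// One-shot, end-of-session flush: walk the table once and report
    /// only the slots that recorded at least one hit.
    fn flush(&self, mut report: impl FnMut(usize, u64)) {
        for (index, count) in self.counts.read().iter().enumerate() {
            let count = count.load(Ordering::Relaxed);
            // Invocations that never hit the cache produce no event.
            if count > 0 {
                report(index, count);
            }
        }
    }
}

// Usage, e.g. at shutdown:
// counters.flush(|invocation, hits| println!("invocation {invocation}: {hits} hits"));
```

Note that because `QUERY_CACHE_HIT_COUNTS` is included in `EventFilter::DEFAULT`, the aggregated counts are recorded by plain `-Zself-profile`, while the expensive per-hit instant events (`QUERY_CACHE_HITS`) still require opting in through `-Zself-profile-events`.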