Skip to content

Commit 14c6835

Browse files
committed
rustc: Wait for all codegen threads to exit
This commit updates rustc to wait for all codegen threads to exit before allowing the main thread to exit. This is a stab in the dark to fix the mysterious segfaults appearing on #55238, and hopefully we'll see whether this actually fixes things in practice...
1 parent 016eaf8 commit 14c6835

File tree

3 files changed

+104
-22
lines changed

3 files changed

+104
-22
lines changed

src/Cargo.lock

-12
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,6 @@ dependencies = [
1515
"rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)",
1616
]
1717

18-
[[package]]
19-
name = "alloc_jemalloc"
20-
version = "0.0.0"
21-
dependencies = [
22-
"build_helper 0.1.0",
23-
"cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)",
24-
"compiler_builtins 0.0.0",
25-
"core 0.0.0",
26-
"libc 0.0.0",
27-
]
28-
2918
[[package]]
3019
name = "alloc_system"
3120
version = "0.0.0"
@@ -2696,7 +2685,6 @@ name = "std"
26962685
version = "0.0.0"
26972686
dependencies = [
26982687
"alloc 0.0.0",
2699-
"alloc_jemalloc 0.0.0",
27002688
"alloc_system 0.0.0",
27012689
"build_helper 0.1.0",
27022690
"cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)",

src/librustc_codegen_llvm/back/write.rs

+51-6
Original file line numberDiff line numberDiff line change
@@ -1508,6 +1508,7 @@ enum Message {
15081508
},
15091509
CodegenComplete,
15101510
CodegenItem,
1511+
CodegenAborted,
15111512
}
15121513

15131514
struct Diagnostic {
@@ -1788,6 +1789,7 @@ fn start_executing_work(tcx: TyCtxt,
17881789
let mut needs_lto = Vec::new();
17891790
let mut lto_import_only_modules = Vec::new();
17901791
let mut started_lto = false;
1792+
let mut codegen_aborted = false;
17911793

17921794
// This flag tracks whether all items have gone through codegens
17931795
let mut codegen_done = false;
@@ -1805,13 +1807,19 @@ fn start_executing_work(tcx: TyCtxt,
18051807
let mut llvm_start_time = None;
18061808

18071809
// Run the message loop while there's still anything that needs message
1808-
// processing:
1810+
// processing. Note that as soon as codegen is aborted we simply want to
1811+
// wait for all existing work to finish, so many of the conditions here
1812+
// only apply if codegen hasn't been aborted as they represent pending
1813+
// work to be done.
18091814
while !codegen_done ||
1810-
work_items.len() > 0 ||
18111815
running > 0 ||
1812-
needs_lto.len() > 0 ||
1813-
lto_import_only_modules.len() > 0 ||
1814-
main_thread_worker_state != MainThreadWorkerState::Idle {
1816+
(!codegen_aborted && (
1817+
work_items.len() > 0 ||
1818+
needs_lto.len() > 0 ||
1819+
lto_import_only_modules.len() > 0 ||
1820+
main_thread_worker_state != MainThreadWorkerState::Idle
1821+
))
1822+
{
18151823

18161824
// While there are still CGUs to be codegened, the coordinator has
18171825
// to decide how to utilize the compiler processes implicit Token:
@@ -1840,6 +1848,9 @@ fn start_executing_work(tcx: TyCtxt,
18401848
spawn_work(cgcx, item);
18411849
}
18421850
}
1851+
} else if codegen_aborted {
1852+
// don't queue up any more work if codegen was aborted, we're
1853+
// just waiting for our existing children to finish
18431854
} else {
18441855
// If we've finished everything related to normal codegen
18451856
// then it must be the case that we've got some LTO work to do.
@@ -1904,7 +1915,7 @@ fn start_executing_work(tcx: TyCtxt,
19041915

19051916
// Spin up what work we can, only doing this while we've got available
19061917
// parallelism slots and work left to spawn.
1907-
while work_items.len() > 0 && running < tokens.len() {
1918+
while !codegen_aborted && work_items.len() > 0 && running < tokens.len() {
19081919
let (item, _) = work_items.pop().unwrap();
19091920

19101921
maybe_start_llvm_timer(cgcx.config(item.module_kind()),
@@ -1969,18 +1980,34 @@ fn start_executing_work(tcx: TyCtxt,
19691980
if !cgcx.opts.debugging_opts.no_parallel_llvm {
19701981
helper.request_token();
19711982
}
1983+
assert!(!codegen_aborted);
19721984
assert_eq!(main_thread_worker_state,
19731985
MainThreadWorkerState::Codegenning);
19741986
main_thread_worker_state = MainThreadWorkerState::Idle;
19751987
}
19761988

19771989
Message::CodegenComplete => {
19781990
codegen_done = true;
1991+
assert!(!codegen_aborted);
19791992
assert_eq!(main_thread_worker_state,
19801993
MainThreadWorkerState::Codegenning);
19811994
main_thread_worker_state = MainThreadWorkerState::Idle;
19821995
}
19831996

1997+
// If codegen is aborted that means translation was aborted due
1998+
// to some normal-ish compiler error. In this situation we want
1999+
// to exit as soon as possible, but we want to make sure all
2000+
// existing work has finished. Flag codegen as being done, and
2001+
// then conditions above will ensure no more work is spawned but
2002+
// we'll keep executing this loop until `running` hits 0.
2003+
Message::CodegenAborted => {
2004+
assert!(!codegen_aborted);
2005+
codegen_done = true;
2006+
codegen_aborted = true;
2007+
assert_eq!(main_thread_worker_state,
2008+
MainThreadWorkerState::Codegenning);
2009+
}
2010+
19842011
// If a thread exits successfully then we drop a token associated
19852012
// with that worker and update our `running` count. We may later
19862013
// re-acquire a token to continue running more work. We may also not
@@ -2446,6 +2473,19 @@ impl OngoingCodegen {
24462473
drop(self.coordinator_send.send(Box::new(Message::CodegenComplete)));
24472474
}
24482475

2476+
/// Consume this context indicating that codegen was entirely aborted, and
2477+
/// we need to exit as quickly as possible.
2478+
///
2479+
/// This method blocks the current thread until all worker threads have
2480+
/// finished, and all worker threads should have exited or be real close to
2481+
/// exiting at this point.
2482+
pub fn codegen_aborted(self) {
2483+
// Signal to the coordinator it should spawn no more work and start
2484+
// shutdown.
2485+
drop(self.coordinator_send.send(Box::new(Message::CodegenAborted)));
2486+
drop(self.future.join());
2487+
}
2488+
24492489
pub fn check_for_errors(&self, sess: &Session) {
24502490
self.shared_emitter_main.check(sess, false);
24512491
}
@@ -2464,6 +2504,11 @@ impl OngoingCodegen {
24642504
}
24652505
}
24662506

2507+
// impl Drop for OngoingCodegen {
2508+
// fn drop(&mut self) {
2509+
// }
2510+
// }
2511+
24672512
pub(crate) fn submit_codegened_module_to_llvm(tcx: TyCtxt,
24682513
module: ModuleCodegen,
24692514
cost: u64) {

src/librustc_codegen_llvm/base.rs

+53-4
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,13 @@ use rustc_data_structures::small_c_str::SmallCStr;
7676
use rustc_data_structures::sync::Lrc;
7777

7878
use std::any::Any;
79+
use std::cmp;
7980
use std::ffi::CString;
80-
use std::sync::Arc;
81-
use std::time::{Instant, Duration};
8281
use std::i32;
83-
use std::cmp;
82+
use std::ops::{Deref, DerefMut};
83+
use std::sync::Arc;
8484
use std::sync::mpsc;
85+
use std::time::{Instant, Duration};
8586
use syntax_pos::Span;
8687
use syntax_pos::symbol::InternedString;
8788
use syntax::attr;
@@ -820,6 +821,7 @@ pub fn codegen_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
820821
metadata,
821822
rx,
822823
codegen_units.len());
824+
let ongoing_codegen = AbortCodegenOnDrop(Some(ongoing_codegen));
823825

824826
// Codegen an allocator shim, if necessary.
825827
//
@@ -949,7 +951,54 @@ pub fn codegen_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
949951
ongoing_codegen.check_for_errors(tcx.sess);
950952

951953
assert_and_save_dep_graph(tcx);
952-
ongoing_codegen
954+
ongoing_codegen.into_inner()
955+
}
956+
957+
/// A curious wrapper structure whose only purpose is to call `codegen_aborted`
958+
/// when it's dropped abnormally.
959+
///
960+
/// In the process of working on rust-lang/rust#55238 a mysterious segfault was
961+
/// stumbled upon. The segfault was never reproduced locally, but it was
962+
/// suspected to be releated to the fact that codegen worker threads were
963+
/// sticking around by the time the main thread was exiting, causing issues.
964+
///
965+
/// This structure is an attempt to fix that issue where the `codegen_aborted`
966+
/// message will block until all workers have finished. This should ensure that
967+
/// even if the main codegen thread panics we'll wait for pending work to
968+
/// complete before returning from the main thread, hopefully avoiding
969+
/// segfaults.
970+
///
971+
/// If you see this comment in the code, then it means that this workaround
972+
/// worked! We may yet one day track down the mysterious cause of that
973+
/// segfault...
974+
struct AbortCodegenOnDrop(Option<OngoingCodegen>);
975+
976+
impl AbortCodegenOnDrop {
977+
fn into_inner(mut self) -> OngoingCodegen {
978+
self.0.take().unwrap()
979+
}
980+
}
981+
982+
impl Deref for AbortCodegenOnDrop {
983+
type Target = OngoingCodegen;
984+
985+
fn deref(&self) -> &OngoingCodegen {
986+
self.0.as_ref().unwrap()
987+
}
988+
}
989+
990+
impl DerefMut for AbortCodegenOnDrop {
991+
fn deref_mut(&mut self) -> &mut OngoingCodegen {
992+
self.0.as_mut().unwrap()
993+
}
994+
}
995+
996+
impl Drop for AbortCodegenOnDrop {
997+
fn drop(&mut self) {
998+
if let Some(codegen) = self.0.take() {
999+
codegen.codegen_aborted();
1000+
}
1001+
}
9531002
}
9541003

9551004
fn assert_and_save_dep_graph<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>) {

0 commit comments

Comments
 (0)