From cd1d393e2b81240d73b1b4b53cd3935f86a71afd Mon Sep 17 00:00:00 2001 From: Ayke van Laethem Date: Thu, 3 Apr 2025 13:29:55 +0200 Subject: [PATCH] WIP baremetal multicore RISC-V --- builder/sizes.go | 2 +- compileopts/config.go | 5 + compileopts/options.go | 2 +- compileopts/options_test.go | 2 +- src/device/riscv/start.S | 39 +++ src/internal/task/atomic-cooperative.go | 2 +- src/internal/task/atomic-preemptive.go | 2 +- src/internal/task/futex-cooperative.go | 2 +- src/internal/task/futex-cores.go | 64 ++++ .../{futex-preemptive.go => futex-threads.go} | 0 src/internal/task/mutex-cooperative.go | 2 +- src/internal/task/mutex-preemptive.go | 2 +- src/internal/task/pmutex-cooperative.go | 2 +- src/internal/task/pmutex-preemptive.go | 2 +- src/internal/task/task.go | 17 + src/internal/task/task_stack.go | 35 +- src/internal/task/task_stack_multicore.go | 53 +++ src/internal/task/task_stack_tinygoriscv.go | 19 +- src/internal/task/task_stack_unicore.go | 37 +++ src/runtime/atomics_critical.go | 61 ++-- src/runtime/gc_stack_cores.go | 11 + src/runtime/gc_stack_raw.go | 2 +- src/runtime/print.go | 13 - src/runtime/runtime_tinygoriscv_qemu.go | 237 +++++++++++-- src/runtime/scheduler_cooperative.go | 16 + src/runtime/scheduler_cores.go | 314 ++++++++++++++++++ src/runtime/scheduler_none.go | 13 +- src/runtime/scheduler_tasks.go | 10 + src/runtime/scheduler_threads.go | 11 + targets/riscv-qemu.json | 9 +- targets/riscv.ld | 23 +- .../gen-critical-atomics.go | 21 +- 32 files changed, 899 insertions(+), 131 deletions(-) create mode 100644 src/internal/task/futex-cores.go rename src/internal/task/{futex-preemptive.go => futex-threads.go} (100%) create mode 100644 src/internal/task/task_stack_multicore.go create mode 100644 src/internal/task/task_stack_unicore.go create mode 100644 src/runtime/gc_stack_cores.go create mode 100644 src/runtime/scheduler_cores.go diff --git a/builder/sizes.go b/builder/sizes.go index 485a652d97..57fb36df67 100644 --- a/builder/sizes.go +++ b/builder/sizes.go @@ -490,7 +490,7 @@ func loadProgramSize(path string, packagePathMap map[string]string) (*programSiz continue } if section.Type == elf.SHT_NOBITS { - if section.Name == ".stack" { + if strings.HasPrefix(section.Name, ".stack") { // TinyGo emits stack sections on microcontroller using the // ".stack" name. // This is a bit ugly, but I don't think there is a way to diff --git a/compileopts/config.go b/compileopts/config.go index d05111f2b0..e1fb27f66e 100644 --- a/compileopts/config.go +++ b/compileopts/config.go @@ -110,6 +110,11 @@ func (c *Config) BuildTags() []string { "math_big_pure_go", // to get math/big to work "gc." + c.GC(), "scheduler." + c.Scheduler(), // used inside the runtime package "serial." + c.Serial()}...) 
// used inside the machine package + switch c.Scheduler() { + case "threads", "cores": + default: + tags = append(tags, "tinygo.unicore") + } for i := 1; i <= c.GoMinorVersion; i++ { tags = append(tags, fmt.Sprintf("go1.%d", i)) } diff --git a/compileopts/options.go b/compileopts/options.go index ddad0b8795..517664db2c 100644 --- a/compileopts/options.go +++ b/compileopts/options.go @@ -10,7 +10,7 @@ import ( var ( validBuildModeOptions = []string{"default", "c-shared", "wasi-legacy"} validGCOptions = []string{"none", "leaking", "conservative", "custom", "precise", "boehm"} - validSchedulerOptions = []string{"none", "tasks", "asyncify", "threads"} + validSchedulerOptions = []string{"none", "tasks", "asyncify", "threads", "cores"} validSerialOptions = []string{"none", "uart", "usb", "rtt"} validPrintSizeOptions = []string{"none", "short", "full", "html"} validPanicStrategyOptions = []string{"print", "trap"} diff --git a/compileopts/options_test.go b/compileopts/options_test.go index e75c10d767..dd098e6c4a 100644 --- a/compileopts/options_test.go +++ b/compileopts/options_test.go @@ -10,7 +10,7 @@ import ( func TestVerifyOptions(t *testing.T) { expectedGCError := errors.New(`invalid gc option 'incorrect': valid values are none, leaking, conservative, custom, precise, boehm`) - expectedSchedulerError := errors.New(`invalid scheduler option 'incorrect': valid values are none, tasks, asyncify, threads`) + expectedSchedulerError := errors.New(`invalid scheduler option 'incorrect': valid values are none, tasks, asyncify, threads, cores`) expectedPrintSizeError := errors.New(`invalid size option 'incorrect': valid values are none, short, full, html`) expectedPanicStrategyError := errors.New(`invalid panic option 'incorrect': valid values are print, trap`) diff --git a/src/device/riscv/start.S b/src/device/riscv/start.S index 25217b3579..d67d82dc5a 100644 --- a/src/device/riscv/start.S +++ b/src/device/riscv/start.S @@ -3,8 +3,47 @@ .type _start,@function _start: + // If we're on a multicore system, we need to wait for hart 0 to wake us up. +#if TINYGO_CORES > 1 + csrr a0, mhartid + + // Hart 0 stack + bnez a0, 1f + la sp, _stack_top + +1: + // Hart 1 stack + li a1, 1 + bne a0, a1, 2f + la sp, _stack1_top + +2: + // Hart 2 stack + #if TINYGO_CORES >= 3 + li a1, 2 + bne a0, a1, 3f + la sp, _stack2_top + #endif + +3: + // Hart 3 stack + #if TINYGO_CORES >= 4 + li a1, 3 + bne a0, a1, 4f + la sp, _stack3_top + #endif + +4: + // done + +#if TINYGO_CORES > 4 +#error only up to 4 cores are supported at the moment! +#endif + +#else // Load the stack pointer. la sp, _stack_top +#endif // Load the globals pointer. The program will load pointers relative to this // register, so it must be set to the right value on startup. 
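[Editor's aside, not part of the patch] The compileopts change above means every scheduler except "threads" and the new "cores" keeps the tinygo.unicore build tag, so code can branch on "is there parallelism?" rather than on a specific scheduler name. A minimal sketch of how a package might consume the tag, mirroring the internal/task split further down (the counter package and file names are hypothetical):

// counter_unicore.go
//go:build tinygo.unicore

package counter

var n int

// On single-core cooperative schedulers a goroutine is never switched out
// between these statements, so no lock is needed.
func Inc() { n++ }

// counter_parallel.go
//go:build !tinygo.unicore

package counter

import "sync"

var (
	mu sync.Mutex
	n  int
)

// With -scheduler=threads or -scheduler=cores another core may run Inc
// concurrently, so a real lock is required.
func Inc() {
	mu.Lock()
	n++
	mu.Unlock()
}

The riscv-qemu target below selects the cores scheduler by default; other targets could presumably opt in with -scheduler=cores once their runtime provides the required hooks (spinlocks, per-core stacks, schedulerWake, and so on).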
diff --git a/src/internal/task/atomic-cooperative.go b/src/internal/task/atomic-cooperative.go index bd4cba8956..e05ea7de0d 100644 --- a/src/internal/task/atomic-cooperative.go +++ b/src/internal/task/atomic-cooperative.go @@ -1,4 +1,4 @@ -//go:build !scheduler.threads +//go:build tinygo.unicore package task diff --git a/src/internal/task/atomic-preemptive.go b/src/internal/task/atomic-preemptive.go index 275f36dce4..b395ef48a3 100644 --- a/src/internal/task/atomic-preemptive.go +++ b/src/internal/task/atomic-preemptive.go @@ -1,4 +1,4 @@ -//go:build scheduler.threads +//go:build !tinygo.unicore package task diff --git a/src/internal/task/futex-cooperative.go b/src/internal/task/futex-cooperative.go index 2a42c28d43..ae9efb5a73 100644 --- a/src/internal/task/futex-cooperative.go +++ b/src/internal/task/futex-cooperative.go @@ -1,4 +1,4 @@ -//go:build !scheduler.threads +//go:build tinygo.unicore package task diff --git a/src/internal/task/futex-cores.go b/src/internal/task/futex-cores.go new file mode 100644 index 0000000000..9bf493f25c --- /dev/null +++ b/src/internal/task/futex-cores.go @@ -0,0 +1,64 @@ +//go:build scheduler.cores + +package task + +import "runtime/interrupt" + +// A futex is a way for userspace to wait with the pointer as the key, and for +// another thread to wake one or all waiting threads keyed on the same pointer. +// +// A futex does not change the underlying value, it only reads it before to prevent +// lost wake-ups. +type Futex struct { + Uint32 + + waiters Stack +} + +// Atomically check for cmp to still be equal to the futex value and if so, go +// to sleep. Return true if we were definitely awoken by a call to Wake or +// WakeAll, and false if we can't be sure of that. +func (f *Futex) Wait(cmp uint32) (awoken bool) { + mask := lockFutex() + + if f.Uint32.Load() != cmp { + unlockFutex(mask) + return false + } + + // Push the current goroutine onto the waiter stack. + f.waiters.Push(Current()) + + unlockFutex(mask) + + // Pause until this task is awoken by Wake/WakeAll. + Pause() + + // We were awoken by a call to Wake or WakeAll. There is no chance for + // spurious wakeups. + return true +} + +// Wake a single waiter. +func (f *Futex) Wake() { + mask := lockFutex() + if t := f.waiters.Pop(); t != nil { + scheduleTask(t) + } + unlockFutex(mask) +} + +// Wake all waiters. 
+func (f *Futex) WakeAll() { + mask := lockFutex() + for t := f.waiters.Pop(); t != nil; t = f.waiters.Pop() { + scheduleTask(t) + } + unlockFutex(mask) +} + +//go:linkname lockFutex runtime.lockFutex +func lockFutex() interrupt.State + +//go:linkname unlockFutex runtime.unlockFutex +func unlockFutex(interrupt.State) diff --git a/src/internal/task/futex-preemptive.go b/src/internal/task/futex-threads.go similarity index 100% rename from src/internal/task/futex-preemptive.go rename to src/internal/task/futex-threads.go diff --git a/src/internal/task/mutex-cooperative.go b/src/internal/task/mutex-cooperative.go index f1205eea25..90274df2bb 100644 --- a/src/internal/task/mutex-cooperative.go +++ b/src/internal/task/mutex-cooperative.go @@ -1,4 +1,4 @@ -//go:build !scheduler.threads +//go:build tinygo.unicore package task diff --git a/src/internal/task/mutex-preemptive.go b/src/internal/task/mutex-preemptive.go index 27f4646698..ec83a6135d 100644 --- a/src/internal/task/mutex-preemptive.go +++ b/src/internal/task/mutex-preemptive.go @@ -1,4 +1,4 @@ -//go:build scheduler.threads +//go:build !tinygo.unicore package task diff --git a/src/internal/task/pmutex-cooperative.go b/src/internal/task/pmutex-cooperative.go index 0e6c4f828b..b61e92d829 100644 --- a/src/internal/task/pmutex-cooperative.go +++ b/src/internal/task/pmutex-cooperative.go @@ -1,4 +1,4 @@ -//go:build !scheduler.threads +//go:build tinygo.unicore package task diff --git a/src/internal/task/pmutex-preemptive.go b/src/internal/task/pmutex-preemptive.go index 10f0a63561..92263ed256 100644 --- a/src/internal/task/pmutex-preemptive.go +++ b/src/internal/task/pmutex-preemptive.go @@ -1,4 +1,4 @@ -//go:build scheduler.threads +//go:build !tinygo.unicore package task diff --git a/src/internal/task/task.go b/src/internal/task/task.go index 58c02fe846..e257e1bc8e 100644 --- a/src/internal/task/task.go +++ b/src/internal/task/task.go @@ -24,11 +24,28 @@ type Task struct { // This is needed for some crypto packages. FipsIndicator uint8 + // State of the goroutine: running, paused, or must-resume-next-pause. + // This extra field doesn't increase memory usage on 32-bit CPUs and above, + // since it falls into the padding of the FipsIndicator bit above. + RunState uint8 + // DeferFrame stores a pointer to the (stack allocated) defer frame of the // goroutine that is used for the recover builtin. DeferFrame unsafe.Pointer } +const ( + // Initial state: the goroutine state is saved on the stack. + RunStatePaused = iota + + // The goroutine is running right now. + RunStateRunning + + // The goroutine is running, but already marked as "can resume". + // The next call to Pause() won't actually pause the goroutine. + RunStateResuming +) + // DataUint32 returns the Data field as a uint32. The value is only valid after // setting it through SetDataUint32 or by storing to it using DataAtomicUint32. func (t *Task) DataUint32() uint32 { diff --git a/src/internal/task/task_stack.go b/src/internal/task/task_stack.go index 74a0a8c7cc..b6c4a5df93 100644 --- a/src/internal/task/task_stack.go +++ b/src/internal/task/task_stack.go @@ -1,9 +1,8 @@ -//go:build scheduler.tasks +//go:build scheduler.tasks || scheduler.cores package task import ( - "runtime/interrupt" "unsafe" ) @@ -32,44 +31,12 @@ type state struct { canaryPtr *uintptr } -// currentTask is the current running task, or nil if currently in the scheduler. -var currentTask *Task - -// Current returns the current active task. 
-func Current() *Task { - return currentTask -} - -// Pause suspends the current task and returns to the scheduler. -// This function may only be called when running on a goroutine stack, not when running on the system stack or in an interrupt. -func Pause() { - // Check whether the canary (the lowest address of the stack) is still - // valid. If it is not, a stack overflow has occurred. - if *currentTask.state.canaryPtr != stackCanary { - runtimePanic("goroutine stack overflow") - } - if interrupt.In() { - runtimePanic("blocked inside interrupt") - } - currentTask.state.pause() -} - //export tinygo_task_exit func taskExit() { // TODO: explicitly free the stack after switching back to the scheduler. Pause() } -// Resume the task until it pauses or completes. -// This may only be called from the scheduler. -func (t *Task) Resume() { - currentTask = t - t.gcData.swap() - t.state.resume() - t.gcData.swap() - currentTask = nil -} - // initialize the state and prepare to call the specified function with the specified argument bundle. func (s *state) initialize(fn uintptr, args unsafe.Pointer, stackSize uintptr) { // Create a stack. diff --git a/src/internal/task/task_stack_multicore.go b/src/internal/task/task_stack_multicore.go new file mode 100644 index 0000000000..65cf3a004c --- /dev/null +++ b/src/internal/task/task_stack_multicore.go @@ -0,0 +1,53 @@ +//go:build scheduler.cores + +package task + +import "runtime/interrupt" + +// Current returns the current active task. +// +//go:linkname Current runtime.currentTask +func Current() *Task + +// Pause suspends the current task and returns to the scheduler. +// This function may only be called when running on a goroutine stack, not when running on the system stack or in an interrupt. +func Pause() { + lockScheduler() + PauseLocked() +} + +// PauseLocked is the same as Pause, but must be called with the scheduler lock +// already taken. +func PauseLocked() { + // Check whether the canary (the lowest address of the stack) is still + // valid. If it is not, a stack overflow has occurred. + current := Current() + if *current.state.canaryPtr != stackCanary { + runtimePanic("goroutine stack overflow") + } + if interrupt.In() { + runtimePanic("blocked inside interrupt") + } + if current.RunState == RunStateResuming { + // Another core already marked this goroutine as ready to resume. + current.RunState = RunStateRunning + unlockScheduler() + return + } + current.RunState = RunStatePaused + current.state.pause() +} + +// Resume the task until it pauses or completes. +// This may only be called from the scheduler. +func (t *Task) Resume() { + t.gcData.swap() + t.state.resume() + t.gcData.swap() +} + +//go:linkname lockScheduler runtime.lockScheduler +func lockScheduler() + +//go:linkname unlockScheduler runtime.unlockScheduler +func unlockScheduler() diff --git a/src/internal/task/task_stack_tinygoriscv.go b/src/internal/task/task_stack_tinygoriscv.go index edf1215a08..541dc96a4c 100644 --- a/src/internal/task/task_stack_tinygoriscv.go +++ b/src/internal/task/task_stack_tinygoriscv.go @@ -1,10 +1,16 @@ -//go:build scheduler.tasks && tinygo.riscv +//go:build (scheduler.tasks || scheduler.cores) && tinygo.riscv package task import "unsafe" -var systemStack uintptr +// Returns a pointer where the system stack can be stored. +// This is a layering violation! We should probably refactor this so that we +// don't need such gymnastics to store the system stack pointer. (It should +// probably be moved to the runtime). 
+// +//go:linkname runtime_systemStackPtr runtime.systemStackPtr +func runtime_systemStackPtr() *uintptr // calleeSavedRegs is the list of registers that must be saved and restored when // switching between tasks. Also see scheduler_riscv.S that relies on the @@ -50,17 +56,18 @@ func (s *state) archInit(r *calleeSavedRegs, fn uintptr, args unsafe.Pointer) { } func (s *state) resume() { - swapTask(s.sp, &systemStack) + swapTask(s.sp, runtime_systemStackPtr()) } func (s *state) pause() { - newStack := systemStack - systemStack = 0 + systemStackPtr := runtime_systemStackPtr() + newStack := *systemStackPtr + *systemStackPtr = 0 swapTask(newStack, &s.sp) } // SystemStack returns the system stack pointer when called from a task stack. // When called from the system stack, it returns 0. func SystemStack() uintptr { - return systemStack + return *runtime_systemStackPtr() } diff --git a/src/internal/task/task_stack_unicore.go b/src/internal/task/task_stack_unicore.go new file mode 100644 index 0000000000..b4425de38f --- /dev/null +++ b/src/internal/task/task_stack_unicore.go @@ -0,0 +1,37 @@ +//go:build scheduler.tasks + +package task + +import "runtime/interrupt" + +// currentTask is the current running task, or nil if currently in the scheduler. +var currentTask *Task + +// Current returns the current active task. +func Current() *Task { + return currentTask +} + +// Pause suspends the current task and returns to the scheduler. +// This function may only be called when running on a goroutine stack, not when running on the system stack or in an interrupt. +func Pause() { + // Check whether the canary (the lowest address of the stack) is still + // valid. If it is not, a stack overflow has occurred. + if *currentTask.state.canaryPtr != stackCanary { + runtimePanic("goroutine stack overflow") + } + if interrupt.In() { + runtimePanic("blocked inside interrupt") + } + currentTask.state.pause() +} + +// Resume the task until it pauses or completes. +// This may only be called from the scheduler. +func (t *Task) Resume() { + currentTask = t + t.gcData.swap() + t.state.resume() + t.gcData.swap() + currentTask = nil +} diff --git a/src/runtime/atomics_critical.go b/src/runtime/atomics_critical.go index 2d98881a10..74ce321f10 100644 --- a/src/runtime/atomics_critical.go +++ b/src/runtime/atomics_critical.go @@ -6,7 +6,6 @@ package runtime import ( - "runtime/interrupt" _ "unsafe" ) @@ -23,27 +22,27 @@ import ( func __atomic_load_2(ptr *uint16, ordering uintptr) uint16 { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. 
- mask := interrupt.Disable() + mask := lockAtomics() val := *ptr - interrupt.Restore(mask) + unlockAtomics(mask) return val } //export __atomic_store_2 func __atomic_store_2(ptr *uint16, val uint16, ordering uintptr) { - mask := interrupt.Disable() + mask := lockAtomics() *ptr = val - interrupt.Restore(mask) + unlockAtomics(mask) } //go:inline func doAtomicCAS16(ptr *uint16, expected, desired uint16) uint16 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -61,10 +60,10 @@ func __atomic_compare_exchange_2(ptr, expected *uint16, desired uint16, successO //go:inline func doAtomicSwap16(ptr *uint16, new uint16) uint16 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -80,11 +79,11 @@ func __atomic_exchange_2(ptr *uint16, new uint16, ordering uintptr) uint16 { //go:inline func doAtomicAdd16(ptr *uint16, value uint16) (old, new uint16) { - mask := interrupt.Disable() + mask := lockAtomics() old = *ptr new = old + value *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old, new } @@ -112,27 +111,27 @@ func __atomic_add_fetch_2(ptr *uint16, value uint16, ordering uintptr) uint16 { func __atomic_load_4(ptr *uint32, ordering uintptr) uint32 { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. - mask := interrupt.Disable() + mask := lockAtomics() val := *ptr - interrupt.Restore(mask) + unlockAtomics(mask) return val } //export __atomic_store_4 func __atomic_store_4(ptr *uint32, val uint32, ordering uintptr) { - mask := interrupt.Disable() + mask := lockAtomics() *ptr = val - interrupt.Restore(mask) + unlockAtomics(mask) } //go:inline func doAtomicCAS32(ptr *uint32, expected, desired uint32) uint32 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -150,10 +149,10 @@ func __atomic_compare_exchange_4(ptr, expected *uint32, desired uint32, successO //go:inline func doAtomicSwap32(ptr *uint32, new uint32) uint32 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -169,11 +168,11 @@ func __atomic_exchange_4(ptr *uint32, new uint32, ordering uintptr) uint32 { //go:inline func doAtomicAdd32(ptr *uint32, value uint32) (old, new uint32) { - mask := interrupt.Disable() + mask := lockAtomics() old = *ptr new = old + value *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old, new } @@ -201,27 +200,27 @@ func __atomic_add_fetch_4(ptr *uint32, value uint32, ordering uintptr) uint32 { func __atomic_load_8(ptr *uint64, ordering uintptr) uint64 { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. 
- mask := interrupt.Disable() + mask := lockAtomics() val := *ptr - interrupt.Restore(mask) + unlockAtomics(mask) return val } //export __atomic_store_8 func __atomic_store_8(ptr *uint64, val uint64, ordering uintptr) { - mask := interrupt.Disable() + mask := lockAtomics() *ptr = val - interrupt.Restore(mask) + unlockAtomics(mask) } //go:inline func doAtomicCAS64(ptr *uint64, expected, desired uint64) uint64 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -239,10 +238,10 @@ func __atomic_compare_exchange_8(ptr, expected *uint64, desired uint64, successO //go:inline func doAtomicSwap64(ptr *uint64, new uint64) uint64 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -258,11 +257,11 @@ func __atomic_exchange_8(ptr *uint64, new uint64, ordering uintptr) uint64 { //go:inline func doAtomicAdd64(ptr *uint64, value uint64) (old, new uint64) { - mask := interrupt.Disable() + mask := lockAtomics() old = *ptr new = old + value *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old, new } diff --git a/src/runtime/gc_stack_cores.go b/src/runtime/gc_stack_cores.go new file mode 100644 index 0000000000..0c1e0c621b --- /dev/null +++ b/src/runtime/gc_stack_cores.go @@ -0,0 +1,11 @@ +//go:build scheduler.cores + +package runtime + +func gcMarkReachable() { + runtimePanic("todo: gcMarkReachable") +} + +func gcResumeWorld() { + // TODO +} diff --git a/src/runtime/gc_stack_raw.go b/src/runtime/gc_stack_raw.go index 94cb5e43b2..4ff25dcec8 100644 --- a/src/runtime/gc_stack_raw.go +++ b/src/runtime/gc_stack_raw.go @@ -1,4 +1,4 @@ -//go:build (gc.conservative || gc.precise || gc.boehm) && !tinygo.wasm && !scheduler.threads +//go:build (gc.conservative || gc.precise || gc.boehm) && !tinygo.wasm && !scheduler.threads && !scheduler.cores package runtime diff --git a/src/runtime/print.go b/src/runtime/print.go index a4de460253..a5fba0c8d0 100644 --- a/src/runtime/print.go +++ b/src/runtime/print.go @@ -1,7 +1,6 @@ package runtime import ( - "internal/task" "unsafe" ) @@ -9,18 +8,6 @@ type stringer interface { String() string } -// Lock to make sure print calls do not interleave. -// This is a no-op lock on systems that do not have parallelism. -var printLock task.PMutex - -func printlock() { - printLock.Lock() -} - -func printunlock() { - printLock.Unlock() -} - //go:nobounds func printstring(s string) { for i := 0; i < len(s); i++ { diff --git a/src/runtime/runtime_tinygoriscv_qemu.go b/src/runtime/runtime_tinygoriscv_qemu.go index a77ad71f55..57b989fc32 100644 --- a/src/runtime/runtime_tinygoriscv_qemu.go +++ b/src/runtime/runtime_tinygoriscv_qemu.go @@ -4,26 +4,77 @@ package runtime import ( "device/riscv" + "math/bits" "runtime/volatile" + "sync/atomic" "unsafe" ) // This file implements the VirtIO RISC-V interface implemented in QEMU, which // is an interface designed for emulation. +const numCPU = 4 + //export main func main() { - preinit() - // Set the interrupt address. // Note that this address must be aligned specially, otherwise the MODE bits // of MTVEC won't be zero. riscv.MTVEC.Set(uintptr(unsafe.Pointer(&handleInterruptASM))) + // Enable software interrupts. We'll need them to wake up other cores. + riscv.MIE.SetBits(riscv.MIE_MSIE) + + // If we're not hart 0, wait until we get the signal everything has been set + // up. 
+ if hartID := riscv.MHARTID.Get(); hartID != 0 { + // Wait until we get the signal this hart is ready to start. + // Note that interrupts are disabled, which means that the interrupt + // isn't actually taken. But we can still wait for it using wfi. + for riscv.MIP.Get()&riscv.MIP_MSIP == 0 { + riscv.Asm("wfi") + } + + // Clear the software interrupt. + aclintMSWI.MSIP[hartID].Set(0) + + // Now that we've cleared the software interrupt, we can enable + // interrupts as was already done on hart 0. + riscv.MSTATUS.SetBits(riscv.MSTATUS_MIE) + + // Now start running the scheduler on this core. + schedulerLock.Lock() + scheduler(false) + + // The scheduler exited, which means main returned and the program + // should exit. + // Make sure hart 0 is woken (it might be asleep at the moment). + if sleepingHarts&0b1 != 0 { + // Hart 0 is currently sleeping, wake it up. + sleepingHarts &^= 0b1 // clear the bit + aclintMSWI.MSIP[0].Set(1) + } + + // Make sure hart 0 can actually enter the scheduler (since we still + // have the scheduler lock) to realize the program has exited. + schedulerLock.Unlock() + + // Now wait until the program exits. This shouldn't take very long. + for { + riscv.Asm("wfi") + } + } + // Enable global interrupts now that they've been set up. // This is currently only for timer interrupts. riscv.MSTATUS.SetBits(riscv.MSTATUS_MIE) + // Set all MTIMECMP registers to a value that clears the MTIP bit in MIP. + // If we don't do this, the wfi instruction won't work as expected. + for i := 0; i < numCPU; i++ { + aclintMTIMECMP[i].Set(0xffff_ffff_ffff_ffff) + } + run() exit(0) } @@ -38,12 +89,13 @@ func handleInterrupt() { if cause&(1<<31) != 0 { // Topmost bit is set, which means that it is an interrupt. switch code { - case riscv.MachineTimerInterrupt: - // Signal timeout. - timerWakeup.Set(1) - // Disable the timer, to avoid triggering the interrupt right after - // this interrupt returns. - riscv.MIE.ClearBits(riscv.MIE_MTIE) + // Note: software and timer interrupts are handled by disabling + // interrupts and waiting for the corresponding bit in MIP to change. + // (This is to avoid TOCTOU issues between checking for a flag and the + // wfi instruction). + default: + print("fatal error: unknown interrupt") + abort() } } else { // Topmost bit is clear, so it is an exception of some sort. @@ -67,23 +119,77 @@ func nanosecondsToTicks(ns int64) timeUnit { return timeUnit(ns / 100) // one tick is 100ns } -var timerWakeup volatile.Register8 - func sleepTicks(d timeUnit) { - // Enable the timer. + // Disable all interrupts. + riscv.MSTATUS.ClearBits(riscv.MSTATUS_MIE) + + // Configure timeout. target := uint64(ticks() + d) - aclintMTIMECMP.Set(target) + hartID := riscv.MHARTID.Get() + aclintMTIMECMP[hartID].Set(target) + + // Wait until the timeout is hit. riscv.MIE.SetBits(riscv.MIE_MTIE) + for riscv.MIP.Get()&riscv.MIP_MTIP == 0 { + riscv.Asm("wfi") + } + riscv.MIE.ClearBits(riscv.MIE_MTIE) - // Wait until it fires. - for { - if timerWakeup.Get() != 0 { - timerWakeup.Set(0) - // Disable timer. - break - } + // Set MTIMECMP to a high value so that MTIP goes low. + aclintMTIMECMP[hartID].Set(0xffff_ffff_ffff_ffff) + + // Re-enable all interrupts. + riscv.MSTATUS.SetBits(riscv.MSTATUS_MIE) +} + +// Currently sleeping core, or 0xff. +// Must only be accessed with the scheduler lock held. +var sleepingCore uint8 = 0xff + +// Almost identical to sleepTicks, except that it will unlock/lock the scheduler +// and is interruptible by exitSleepTicksMulticore. 
+func sleepTicksMulticore(d timeUnit) { + // Disable all interrupts. + riscv.MSTATUS.ClearBits(riscv.MSTATUS_MIE) + + // Configure timeout. + target := uint64(ticks() + d) + hartID := riscv.MHARTID.Get() + aclintMTIMECMP[hartID].Set(target) + sleepingCore = uint8(hartID) + + // Unlock, now that the timeout has been set (so that + // exitSleepTicksMulticore will see the correct wakeup time). + schedulerLock.Unlock() + + // Wait until the timeout is hit. + riscv.MIE.SetBits(riscv.MIE_MTIE) + for riscv.MIP.Get()&riscv.MIP_MTIP == 0 { riscv.Asm("wfi") } + riscv.MIE.ClearBits(riscv.MIE_MTIE) + + // Lock again, after we finished sleeping. + schedulerLock.Lock() + + // Set MTIMECMP to a high value so that MTIP goes low. + aclintMTIMECMP[hartID].Set(0xffff_ffff_ffff_ffff) + sleepingCore = 0xff + + // Re-enable all interrupts. + riscv.MSTATUS.SetBits(riscv.MSTATUS_MIE) +} + +// Interrupt an ongoing call to sleepTicksMulticore on another core. +// This may only be called with the scheduler lock held. +func interruptSleepTicksMulticore(wakeup timeUnit) { + if sleepingCore != 0xff { + // Immediately exit the sleep. + old := aclintMTIMECMP[sleepingCore].Get() + if uint64(wakeup) < old { + aclintMTIMECMP[sleepingCore].Set(uint64(wakeup)) + } + } } func ticks() timeUnit { @@ -98,7 +204,7 @@ func ticks() timeUnit { return timeUnit(lowBits) | (timeUnit(highBits) << 32) } // Retry, because there was a rollover in the low bits (happening every - // 429 days). + // ~7 days). highBits = newHighBits } } @@ -120,7 +226,10 @@ var ( low volatile.Register32 high volatile.Register32 })(unsafe.Pointer(uintptr(0x0200_bff8))) - aclintMTIMECMP = (*volatile.Register64)(unsafe.Pointer(uintptr(0x0200_4000))) + aclintMTIMECMP = (*[4095]volatile.Register64)(unsafe.Pointer(uintptr(0x0200_4000))) + aclintMSWI = (*struct { + MSIP [4095]volatile.Register32 + })(unsafe.Pointer(uintptr(0x0200_0000))) ) func putchar(c byte) { @@ -137,6 +246,92 @@ func buffered() int { return 0 } +// Define the various spinlocks needed by the runtime. +var ( + schedulerLock spinLock + futexLock spinLock + atomicsLock spinLock + printLock spinLock +) + +type spinLock struct { + atomic.Uint32 +} + +func (l *spinLock) Lock() { + // Try to replace 0 with 1. Once we succeed, the lock has been acquired. + for !l.Uint32.CompareAndSwap(0, 1) { + // Hint to the CPU that this core is just waiting, and the core can go + // into a lower energy state. + // This is a no-op in QEMU TCG (but added here for completeness): + // https://github.com/qemu/qemu/blob/v9.2.3/target/riscv/insn_trans/trans_rvi.c.inc#L856 + riscv.Asm("pause") + } +} + +func (l *spinLock) Unlock() { + // Safety check: the spinlock should have been locked. + if schedulerAsserts && l.Uint32.Load() != 1 { + runtimePanic("unlock of unlocked spinlock") + } + + // Unlock the lock. Simply write 0, because we already know it is locked. + l.Uint32.Store(0) +} + +func currentCPU() uint32 { + return uint32(riscv.MHARTID.Get()) +} + +func startSecondaryCores() { + // Start all the other cores besides hart 0. + for hart := 1; hart < numCPU; hart++ { + // Signal the given hart it is ready to start using a software + // interrupt. + aclintMSWI.MSIP[hart].Set(1) + } +} + +// Bitset of harts that are currently sleeping in schedulerUnlockAndWait. +// This supports up to 8 harts. +// This variable may only be accessed with the scheduler lock held. +var sleepingHarts uint8 + +// Put the scheduler to sleep, since there are no tasks to run. 
+// This will unlock the scheduler lock, and must be called with the scheduler +// lock held. +func schedulerUnlockAndWait() { + hartID := riscv.MHARTID.Get() + + // Mark the current hart as sleeping. + sleepingHarts |= uint8(1 << hartID) + + // Wait for a software interrupt, with interrupts disabled and the scheduler + // unlocked. + riscv.MSTATUS.ClearBits(riscv.MSTATUS_MIE) + schedulerLock.Unlock() + for riscv.MIP.Get()&riscv.MIP_MSIP == 0 { + riscv.Asm("wfi") + } + aclintMSWI.MSIP[hartID].Set(0) + schedulerLock.Lock() + riscv.MSTATUS.SetBits(riscv.MSTATUS_MIE) +} + +// Wake another core, if one is sleeping. Must be called with the scheduler lock +// held. +func schedulerWake() { + // Look up the lowest-numbered hart that is sleeping. + // Returns 8 if there are no sleeping harts. + hart := bits.TrailingZeros8(sleepingHarts) + + if hart < 8 { + // There is a sleeping hart. Wake it. + sleepingHarts &^= 1 << hart // clear the bit + aclintMSWI.MSIP[hart].Set(1) // send software interrupt + } +} + func abort() { exit(1) } diff --git a/src/runtime/scheduler_cooperative.go b/src/runtime/scheduler_cooperative.go index 5f569c6e14..bf6f5aec49 100644 --- a/src/runtime/scheduler_cooperative.go +++ b/src/runtime/scheduler_cooperative.go @@ -252,3 +252,19 @@ func run() { }() scheduler(false) } + +func lockAtomics() interrupt.State { + return interrupt.Disable() +} + +func unlockAtomics(mask interrupt.State) { + interrupt.Restore(mask) +} + +func printlock() { + // nothing to do +} + +func printunlock() { + // nothing to do +} diff --git a/src/runtime/scheduler_cores.go b/src/runtime/scheduler_cores.go new file mode 100644 index 0000000000..1724aaaea4 --- /dev/null +++ b/src/runtime/scheduler_cores.go @@ -0,0 +1,314 @@ +//go:build scheduler.cores + +package runtime + +import ( + "internal/task" + "runtime/interrupt" + "sync/atomic" +) + +const hasScheduler = true + +const hasParallelism = true + +var mainExited atomic.Uint32 + +// Which task is running on a given core (or nil if there is no task running on +// the core). +var cpuTasks [numCPU]*task.Task + +var ( + sleepQueue *task.Task + runQueue *task.Task +) + +func deadlock() { + // Call yield without requesting a wakeup. + task.Pause() + trap() +} + +// Mark the given task as ready to resume. +// This is allowed even if the task isn't paused yet, but will pause soon. +func scheduleTask(t *task.Task) { + schedulerLock.Lock() + switch t.RunState { + case task.RunStatePaused: + // Paused, state is saved on the stack. + // Add it to the runqueue... + addToRunQueue(t) + // ...and wake up a sleeping core, if there is one. + // (If all cores are already busy, this is a no-op). + schedulerWake() + case task.RunStateRunning: + // Not yet paused (probably going to pause very soon), so let the + // Pause() function know it can resume immediately. + t.RunState = task.RunStateResuming + default: + if schedulerAsserts { + runtimePanic("scheduler: unknown run state") + } + } + schedulerLock.Unlock() +} + +// Add task to runQueue. +// Scheduler lock must be held when calling this function. +func addToRunQueue(t *task.Task) { + t.Next = runQueue + runQueue = t +} + +func addSleepTask(t *task.Task, wakeup timeUnit) { + // Save the timestamp when the task should be woken up. + t.Data = uint64(wakeup) + + // If another core is currently using the timer, make sure it wakes up at + // the right time. + interruptSleepTicksMulticore(wakeup) + + // Find the position where we should insert this task in the queue. 
+ q := &sleepQueue + for { + if *q == nil { + // Found the end of the time queue. Insert it here, at the end. + break + } + if timeUnit((*q).Data) > timeUnit(t.Data) { + // Found a task in the queue that has a timeout before the + // to-be-sleeping task. Insert our task right before. + break + } + q = &(*q).Next + } + + // Insert the task into the queue (this could be at the end, if *q is nil). + t.Next = *q + *q = t +} + +func Gosched() { + addToRunQueue(task.Current()) + task.Pause() +} + +func addTimer(tn *timerNode) { + schedulerLock.Lock() + timerQueueAdd(tn) + interruptSleepTicksMulticore(tn.whenTicks()) + schedulerLock.Unlock() +} + +func removeTimer(t *timer) *timerNode { + schedulerLock.Lock() + n := timerQueueRemove(t) + schedulerLock.Unlock() + return n +} + +func schedulerRunQueue() *task.Queue { + // This should not be reachable with the cores scheduler. + runtimePanic("unimplemented: schedulerRunQueue") + return nil +} + +// Pause the current task for a given time. +// +//go:linkname sleep time.Sleep +func sleep(duration int64) { + if duration <= 0 { + return + } + + wakeup := ticks() + nanosecondsToTicks(duration) + + // While the scheduler is locked: + // - add this task to the sleep queue + // - switch to the scheduler (only allowed while locked) + // - let the scheduler handle it from there + schedulerLock.Lock() + addSleepTask(task.Current(), wakeup) + task.PauseLocked() +} + +// This function is called on the first core in the system. It will wake up the +// other cores when ready. +func run() { + initHeap() + + go func() { + // Package initializers are currently run single-threaded. + // This might help with registering interrupts and such. + initAll() + + // After package initializers have finished, start all the other cores. + startSecondaryCores() + + // Run main.main. + callMain() + + // main.main has exited, so the program should exit. + mainExited.Store(1) + }() + + // The scheduler must always be entered while the scheduler lock is taken. + schedulerLock.Lock() + scheduler(false) + schedulerLock.Unlock() +} + +var schedulerIsRunning = false + +func scheduler(_ bool) { + for mainExited.Load() == 0 { + // Check for ready-to-run tasks. + if runnable := runQueue; runnable != nil { + // Pop off the run queue. + runQueue = runnable.Next + runnable.Next = nil + + // Resume it now. + setCurrentTask(runnable) + runnable.RunState = task.RunStateRunning + schedulerLock.Unlock() // unlock before resuming, Pause() will lock again + runnable.Resume() + setCurrentTask(nil) + + continue + } + + var now timeUnit + if sleepQueue != nil || timerQueue != nil { + now = ticks() + + // Check whether the first task in the sleep queue is ready to run. + if sleepingTask := sleepQueue; sleepingTask != nil && now >= timeUnit(sleepingTask.Data) { + // It is, pop it from the queue. + sleepQueue = sleepQueue.Next + sleepingTask.Next = nil + + // Run it now. + setCurrentTask(sleepingTask) + sleepingTask.RunState = task.RunStateRunning + schedulerLock.Unlock() // unlock before resuming, Pause() will lock again + sleepingTask.Resume() + setCurrentTask(nil) + continue + } + + // Check whether a timer has expired that needs to be run. + if timerQueue != nil && now >= timerQueue.whenTicks() { + delay := ticksToNanoseconds(now - timerQueue.whenTicks()) + // Pop timer from queue. + tn := timerQueue + timerQueue = tn.next + tn.next = nil + + // Run the callback stored in this timer node. 
+ schedulerLock.Unlock() + tn.callback(tn, delay) + schedulerLock.Lock() + continue + } + } + + // At this point, there are no runnable tasks anymore. + // If another core is using the clock, let it handle the sleep queue. + if schedulerIsRunning { + schedulerUnlockAndWait() + continue + } + + // The timer is free to use, so check whether there are any future + // tasks/timers that we can wait for. + var timeLeft timeUnit + if sleepingTask := sleepQueue; sleepingTask != nil { + // We already checked that there is no ready-to-run sleeping task + // (using the same 'now' value), so timeLeft will always be + // positive. + timeLeft = timeUnit(sleepingTask.Data) - now + } + if timerQueue != nil { + // If the timer queue needs to run earlier, reduce the time we are + // going to sleep. + // Like with sleepQueue, we already know there is no timer ready to + // run since we already checked above. + timeLeftForTimer := timerQueue.whenTicks() - now + if sleepQueue == nil || timeLeftForTimer < timeLeft { + timeLeft = timeLeftForTimer + } + } + + if timeLeft > 0 { + // Sleep for a bit until the next task or timer is ready to run. + schedulerIsRunning = true + sleepTicksMulticore(timeLeft) + schedulerIsRunning = false + continue + } + + // No runnable tasks and no sleeping tasks or timers. There's nothing to + // do. + // Wait until something happens (like an interrupt). + schedulerUnlockAndWait() + } +} + +func currentTask() *task.Task { + return cpuTasks[currentCPU()] +} + +func setCurrentTask(task *task.Task) { + cpuTasks[currentCPU()] = task +} + +func lockScheduler() { + schedulerLock.Lock() +} + +func unlockScheduler() { + schedulerLock.Unlock() +} + +func lockFutex() interrupt.State { + mask := interrupt.Disable() + futexLock.Lock() + return mask +} + +func unlockFutex(state interrupt.State) { + futexLock.Unlock() + interrupt.Restore(state) +} + +// Use a single spinlock for atomics. This works fine, since atomics are very +// short sequences of instructions. +func lockAtomics() interrupt.State { + mask := interrupt.Disable() + atomicsLock.Lock() + return mask +} + +func unlockAtomics(mask interrupt.State) { + atomicsLock.Unlock() + interrupt.Restore(mask) +} + +var systemStack [numCPU]uintptr + +// Implementation detail of the internal/task package. +// It needs to store the system stack pointer somewhere, and needs to know how +// many cores there are to do so. But it doesn't know the number of cores. Hence +// why this is implemented in the runtime. 
+func systemStackPtr() *uintptr { + return &systemStack[currentCPU()] +} + +func printlock() { + printLock.Lock() +} + +func printunlock() { + printLock.Unlock() +} diff --git a/src/runtime/scheduler_none.go b/src/runtime/scheduler_none.go index 3f88e03ebf..9e654d1f50 100644 --- a/src/runtime/scheduler_none.go +++ b/src/runtime/scheduler_none.go @@ -2,7 +2,10 @@ package runtime -import "internal/task" +import ( + "internal/task" + "runtime/interrupt" +) const hasScheduler = false @@ -79,3 +82,11 @@ func scheduler(returnAtDeadlock bool) { func getSystemStackPointer() uintptr { return getCurrentStackPointer() } + +func lockAtomics() interrupt.State { + return interrupt.Disable() +} + +func unlockAtomics(mask interrupt.State) { + interrupt.Restore(mask) +} diff --git a/src/runtime/scheduler_tasks.go b/src/runtime/scheduler_tasks.go index 6ee540fd35..4b6e1025e8 100644 --- a/src/runtime/scheduler_tasks.go +++ b/src/runtime/scheduler_tasks.go @@ -15,3 +15,13 @@ func getSystemStackPointer() uintptr { } return sp } + +var systemStack uintptr + +// Implementation detail of the internal/task package. +// It needs to store the system stack pointer somewhere, and needs to know how +// many cores there are to do so. But it doesn't know the number of cores. Hence +// why this is implemented in the runtime. +func systemStackPtr() *uintptr { + return &systemStack +} diff --git a/src/runtime/scheduler_threads.go b/src/runtime/scheduler_threads.go index 32b9caaf21..38a78baaae 100644 --- a/src/runtime/scheduler_threads.go +++ b/src/runtime/scheduler_threads.go @@ -127,3 +127,14 @@ func runqueueForGC() *task.Queue { // There is only a runqueue when using the cooperative scheduler. return nil } + +// Lock to make sure print calls do not interleave. +var printLock task.Mutex + +func printlock() { + printLock.Lock() +} + +func printunlock() { + printLock.Unlock() +} diff --git a/targets/riscv-qemu.json b/targets/riscv-qemu.json index 8a85cf9a72..e8ccc76195 100644 --- a/targets/riscv-qemu.json +++ b/targets/riscv-qemu.json @@ -1,8 +1,13 @@ { "inherits": ["riscv32"], - "features": 
"+32bit,+a,+c,+m,+zmmul,-b,-d,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zacas,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-f,-h,-relax,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xesppie,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-za64rs,-zaamo,-zabha,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zicond,-zicsr,-zifencei,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-ztso,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b", + "features": "+32bit,+a,+c,+m,+zihintpause,+zmmul,-b,-d,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zacas,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-f,-h,-relax,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xesppie,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-za64rs,+zaamo,-zabha,+zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zicond,-zicsr,-zifencei,-zihintntl,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-ztso,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b", "build-tags": ["virt", "qemu"], + "scheduler": "cores", "default-stack-size": 8192, + "cflags": [ + "-march=rv32imaczihintpause", + "-DTINYGO_CORES=4" + ], "linkerscript": "targets/riscv-qemu.ld", - "emulator": "qemu-system-riscv32 -machine virt,aclint=on -nographic -bios none -device virtio-rng-device -kernel {}" + "emulator": "qemu-system-riscv32 -machine virt,aclint=on -smp 4 -nographic -bios 
none -device virtio-rng-device -kernel {}" } diff --git a/targets/riscv.ld b/targets/riscv.ld index eecac6b476..63748b1349 100644 --- a/targets/riscv.ld +++ b/targets/riscv.ld @@ -16,13 +16,34 @@ SECTIONS /* Put the stack at the bottom of RAM, so that the application will * crash on stack overflow instead of silently corrupting memory. * See: http://blog.japaric.io/stack-overflow-protection/ */ - .stack (NOLOAD) : + .stack0 (NOLOAD) : { . = ALIGN(16); . += _stack_size; _stack_top = .; } >RAM + .stack1 (NOLOAD) : + { + . = ALIGN(16); + . += _stack_size; + _stack1_top = .; + } >RAM + + .stack2 (NOLOAD) : + { + . = ALIGN(16); + . += _stack_size; + _stack2_top = .; + } >RAM + + .stack3 (NOLOAD) : + { + . = ALIGN(16); + . += _stack_size; + _stack3_top = .; + } >RAM + /* Start address (in flash) of .data, used by startup code. */ _sidata = LOADADDR(.data); diff --git a/tools/gen-critical-atomics/gen-critical-atomics.go b/tools/gen-critical-atomics/gen-critical-atomics.go index 75ea327076..98ceebb020 100644 --- a/tools/gen-critical-atomics/gen-critical-atomics.go +++ b/tools/gen-critical-atomics/gen-critical-atomics.go @@ -26,7 +26,6 @@ package runtime import ( _ "unsafe" - "runtime/interrupt" ) // Documentation: @@ -41,29 +40,29 @@ import ( func __atomic_load_{{.}}(ptr *uint{{$bits}}, ordering uintptr) uint{{$bits}} { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. - mask := interrupt.Disable() + mask := lockAtomics() val := *ptr - interrupt.Restore(mask) + unlockAtomics(mask) return val } {{end}} {{- define "store"}}{{$bits := mul . 8 -}} //export __atomic_store_{{.}} func __atomic_store_{{.}}(ptr *uint{{$bits}}, val uint{{$bits}}, ordering uintptr) { - mask := interrupt.Disable() + mask := lockAtomics() *ptr = val - interrupt.Restore(mask) + unlockAtomics(mask) } {{end}} {{- define "cas"}}{{$bits := mul . 8 -}} //go:inline func doAtomicCAS{{$bits}}(ptr *uint{{$bits}}, expected, desired uint{{$bits}}) uint{{$bits}} { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -82,10 +81,10 @@ func __atomic_compare_exchange_{{.}}(ptr, expected *uint{{$bits}}, desired uint{ {{- define "swap"}}{{$bits := mul . 8 -}} //go:inline func doAtomicSwap{{$bits}}(ptr *uint{{$bits}}, new uint{{$bits}}) uint{{$bits}} { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -111,11 +110,11 @@ func __atomic_exchange_{{.}}(ptr *uint{{$bits}}, new uint{{$bits}}, ordering uin //go:inline func {{$opfn}}(ptr *{{$type}}, value {{$type}}) (old, new {{$type}}) { - mask := interrupt.Disable() + mask := lockAtomics() old = *ptr {{$opdef}} *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old, new }
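[Editor's aside, not part of the patch] To make the futex-cores.go protocol above concrete, here is a minimal sketch of a one-shot event built on the new Futex type; the Event type is hypothetical, but the Load/Store/Wait/WakeAll calls are the ones added in this patch (Wait only sleeps while the value still equals the compare argument, which is what prevents a wake-up from being lost between the check and the sleep):

package task_example // hypothetical; real users of Futex live in internal/task and runtime

import "internal/task"

// Event is a one-shot flag that goroutines can block on.
type Event struct {
	futex task.Futex
}

// Wait blocks until Signal has been called.
func (e *Event) Wait() {
	// Loop, because Futex.Wait may return without a matching Wake on some
	// schedulers (its return value reports whether the wake-up was genuine).
	for e.futex.Load() == 0 {
		// Sleep only if the value is still 0; a Signal that lands between
		// the Load above and this call makes Wait return immediately.
		e.futex.Wait(0)
	}
}

// Signal publishes the flag and wakes every waiter.
func (e *Event) Signal() {
	e.futex.Store(1)
	e.futex.WakeAll()
}

On the cores scheduler, Wait and Wake serialize on the new futexLock spinlock (with interrupts disabled via lockFutex), so a Wake on one core cannot race with a Wait that is about to park on another.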