Rollup merge of rust-lang#52051 - scottmcm:swap-directly, r=alexcrichton

kennytm · kennytm · commit b954d4d1b5a9 · 2018-07-23T01:02:41.000+08:00
mem::swap the obvious way for types smaller than the SIMD optimization's block size

LLVM isn't able to remove the alloca for the unaligned block in the post-SIMD tail in some cases, so doing this helps SRoA work in cases where it currently doesn't. Found in the `replace_with` RFC discussion.

Examples of the improvements:

<details>
<summary>swapping `[u16; 3]` takes 1/3 fewer instructions and no stackalloc</summary>

```rust
type Demo = [u16; 3];
pub fn swap_demo(x: &mut Demo, y: &mut Demo) {
    std::mem::swap(x, y);
}
```

nightly:

```asm
_ZN4blah9swap_demo17ha1732a9b71393a7eE:
.seh_proc _ZN4blah9swap_demo17ha1732a9b71393a7eE
    sub rsp, 32
    .seh_stackalloc 32
    .seh_endprologue
    movzx eax, word ptr [rcx + 4]
    mov word ptr [rsp + 4], ax
    mov eax, dword ptr [rcx]
    mov dword ptr [rsp], eax
    movzx eax, word ptr [rdx + 4]
    mov word ptr [rcx + 4], ax
    mov eax, dword ptr [rdx]
    mov dword ptr [rcx], eax
    movzx eax, word ptr [rsp + 4]
    mov word ptr [rdx + 4], ax
    mov eax, dword ptr [rsp]
    mov dword ptr [rdx], eax
    add rsp, 32
    ret
    .seh_handlerdata
    .section .text,"xr",one_only,_ZN4blah9swap_demo17ha1732a9b71393a7eE
    .seh_endproc
```

this PR:

```asm
_ZN4blah9swap_demo17ha1732a9b71393a7eE:
    mov r8d, dword ptr [rcx]
    movzx r9d, word ptr [rcx + 4]
    movzx eax, word ptr [rdx + 4]
    mov word ptr [rcx + 4], ax
    mov eax, dword ptr [rdx]
    mov dword ptr [rcx], eax
    mov word ptr [rdx + 4], r9w
    mov dword ptr [rdx], r8d
    ret
```

</details>

<details>
<summary>`replace_with` optimizes down much better</summary>

Inspired by rust-lang/rfcs#2490,

```rust
fn replace_with<T, F>(x: &mut Option<T>, f: F)
    where F: FnOnce(Option<T>) -> Option<T>
{
    *x = f(x.take());
}

pub fn inc_opt(mut x: &mut Option<i32>) {
    replace_with(&mut x, |i| i.map(|j| j + 1));
}
```

Rust 1.26.0:

```asm
_ZN4blah7inc_opt17heb0acb64c51777cfE:
    mov rax, qword ptr [rcx]
    movabs r8, 4294967296
    add r8, rax
    shl rax, 32
    movabs rdx, -4294967296
    and rdx, r8
    xor r8d, r8d
    test rax, rax
    cmove rdx, rax
    setne r8b
    or rdx, r8
    mov qword ptr [rcx], rdx
    ret
```

Nightly (better thanks to ScalarPair, maybe?):

```asm
_ZN4blah7inc_opt17h66df690be0b5899dE:
    mov r8, qword ptr [rcx]
    mov rdx, r8
    shr rdx, 32
    xor eax, eax
    test r8d, r8d
    setne al
    add edx, 1
    mov dword ptr [rcx], eax
    mov dword ptr [rcx + 4], edx
    ret
```

This PR:

```asm
_ZN4blah7inc_opt17h1426dc215ecbdb19E:
    xor eax, eax
    cmp dword ptr [rcx], 0
    setne al
    mov dword ptr [rcx], eax
    add dword ptr [rcx + 4], 1
    ret
```

Where that add is beautiful -- using an addressing mode to not even need to explicitly go through a register -- and the remaining imperfection is well-known (rust-lang#49420 (comment)).

</details>
diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
@@ -638,7 +638,7 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        ptr::swap_nonoverlapping(x, y, 1);
+        ptr::swap_nonoverlapping_one(x, y);
     }
 }
 
diff --git a/src/libcore/ptr.rs b/src/libcore/ptr.rs
@@ -187,6 +187,19 @@ pub unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
     swap_nonoverlapping_bytes(x, y, len)
 }
 
+#[inline]
+pub(crate) unsafe fn swap_nonoverlapping_one<T>(x: *mut T, y: *mut T) {
+    // Swaps a single `T` between `x` and `y`. Types under 32 bytes are swapped
+    // directly through a temporary to avoid pessimizing codegen for them.
+    if mem::size_of::<T>() < 32 {
+        let z = read(x); // save the value currently at `x`
+        copy_nonoverlapping(y, x, 1); // overwrite `x` with the value at `y`
+        write(y, z); // store the saved value into `y`
+    } else {
+        swap_nonoverlapping(x, y, 1); // larger types keep the general path
+    }
+}
+
 #[inline]
 unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, len: usize) {
     // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
diff --git a/src/test/codegen/swap-small-types.rs b/src/test/codegen/swap-small-types.rs
@@ -0,0 +1,27 @@
+// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// compile-flags: -O
+// only-x86_64
+
+#![crate_type = "lib"]
+
+use std::mem::swap;
+
+type RGB48 = [u16; 3]; // three 16-bit values = 48 bits, hence the `i48` expectations below
+
+// CHECK-LABEL: @swap_rgb48
+#[no_mangle]
+pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
+// CHECK-NOT: alloca
+// CHECK: load i48
+// CHECK: store i48
+    swap(x, y)
+}

Original file line number	Diff line number	Diff line change
`@@ -638,7 +638,7 @@ pub unsafe fn uninitialized<T>() -> T {`
`638`	`638`	`#[stable(feature = "rust1", since = "1.0.0")]`
`639`	`639`	`pub fn swap<T>(x: &mut T, y: &mut T) {`
`640`	`640`	`unsafe {`
`641`		`- ptr::swap_nonoverlapping(x, y, 1);`
	`641`	`+ ptr::swap_nonoverlapping_one(x, y);`
`642`	`642`	`}`
`643`	`643`	`}`
`644`	`644`