diff --git a/lib/std/target.zig b/lib/std/target.zig index c23a42c9637a..2db1415cd34f 100644 --- a/lib/std/target.zig +++ b/lib/std/target.zig @@ -1773,6 +1773,88 @@ pub const Target = struct { else => false, }; } + + pub inline fn maxIntAlignment(target: Target) u16 { + return switch (target.cpu.arch) { + .avr => 1, + .msp430 => 2, + .xcore => 4, + + .arm, + .armeb, + .thumb, + .thumbeb, + .hexagon, + .mips, + .mipsel, + .powerpc, + .powerpcle, + .r600, + .amdgcn, + .riscv32, + .sparc, + .sparcel, + .s390x, + .lanai, + .wasm32, + .wasm64, + => 8, + + .i386 => return switch (target.os.tag) { + .windows => 8, + else => 4, + }, + + // For x86_64, LLVMABIAlignmentOfType(i128) reports 8. However I think 16 + // is a better number for two reasons: + // 1. Better machine code when loading into SIMD register. + // 2. The C ABI wants 16 for extern structs. + // 3. 16-byte cmpxchg needs 16-byte alignment. + // Same logic for riscv64, powerpc64, mips64, sparcv9. + .x86_64, + .riscv64, + .powerpc64, + .powerpc64le, + .mips64, + .mips64el, + .sparcv9, + + // Even LLVMABIAlignmentOfType(i128) agrees on these targets. + .aarch64, + .aarch64_be, + .aarch64_32, + .bpfel, + .bpfeb, + .nvptx, + .nvptx64, + => 16, + + // Below this comment are unverified but based on the fact that C requires + // int128_t to be 16 bytes aligned, it's a safe default. + .spu_2, + .csky, + .arc, + .m68k, + .tce, + .tcele, + .le32, + .amdil, + .hsail, + .spir, + .kalimba, + .renderscript32, + .spirv32, + .shave, + .le64, + .amdil64, + .hsail64, + .spir64, + .renderscript64, + .ve, + .spirv64, + => 16, + }; + } }; test { diff --git a/src/AstGen.zig b/src/AstGen.zig index 1da1cf187e26..d4895aa2e4e8 100644 --- a/src/AstGen.zig +++ b/src/AstGen.zig @@ -7423,42 +7423,22 @@ fn builtinCall( }, .atomic_load => { - const int_type = try typeExpr(gz, scope, params[0]); - // TODO allow this pointer type to be volatile - const ptr_type = try gz.add(.{ .tag = .ptr_type_simple, .data = .{ - .ptr_type_simple = .{ - .is_allowzero = false, - .is_mutable = false, - .is_volatile = false, - .size = .One, - .elem_type = int_type, - }, - } }); - const result = try gz.addPlNode(.atomic_load, node, Zir.Inst.Bin{ + const result = try gz.addPlNode(.atomic_load, node, Zir.Inst.AtomicLoad{ // zig fmt: off - .lhs = try expr(gz, scope, .{ .coerced_ty = ptr_type }, params[1]), - .rhs = try expr(gz, scope, .{ .coerced_ty = .atomic_order_type }, params[2]), + .elem_type = try typeExpr(gz, scope, params[0]), + .ptr = try expr (gz, scope, .none, params[1]), + .ordering = try expr (gz, scope, .{ .coerced_ty = .atomic_order_type }, params[2]), // zig fmt: on }); return rvalue(gz, rl, result, node); }, .atomic_rmw => { const int_type = try typeExpr(gz, scope, params[0]); - // TODO allow this pointer type to be volatile - const ptr_type = try gz.add(.{ .tag = .ptr_type_simple, .data = .{ - .ptr_type_simple = .{ - .is_allowzero = false, - .is_mutable = true, - .is_volatile = false, - .size = .One, - .elem_type = int_type, - }, - } }); const result = try gz.addPlNode(.atomic_rmw, node, Zir.Inst.AtomicRmw{ // zig fmt: off - .ptr = try expr(gz, scope, .{ .coerced_ty = ptr_type }, params[1]), + .ptr = try expr(gz, scope, .none, params[1]), .operation = try expr(gz, scope, .{ .coerced_ty = .atomic_rmw_op_type }, params[2]), - .operand = try expr(gz, scope, .{ .coerced_ty = int_type }, params[3]), + .operand = try expr(gz, scope, .{ .ty = int_type }, params[3]), .ordering = try expr(gz, scope, .{ .coerced_ty = .atomic_order_type }, params[4]), // zig fmt: on }); @@ -7466,20 
+7446,10 @@ fn builtinCall( }, .atomic_store => { const int_type = try typeExpr(gz, scope, params[0]); - // TODO allow this pointer type to be volatile - const ptr_type = try gz.add(.{ .tag = .ptr_type_simple, .data = .{ - .ptr_type_simple = .{ - .is_allowzero = false, - .is_mutable = true, - .is_volatile = false, - .size = .One, - .elem_type = int_type, - }, - } }); const result = try gz.addPlNode(.atomic_store, node, Zir.Inst.AtomicStore{ // zig fmt: off - .ptr = try expr(gz, scope, .{ .coerced_ty = ptr_type }, params[1]), - .operand = try expr(gz, scope, .{ .coerced_ty = int_type }, params[2]), + .ptr = try expr(gz, scope, .none, params[1]), + .operand = try expr(gz, scope, .{ .ty = int_type }, params[2]), .ordering = try expr(gz, scope, .{ .coerced_ty = .atomic_order_type }, params[3]), // zig fmt: on }); @@ -7684,20 +7654,10 @@ fn cmpxchg( tag: Zir.Inst.Tag, ) InnerError!Zir.Inst.Ref { const int_type = try typeExpr(gz, scope, params[0]); - // TODO: allow this to be volatile - const ptr_type = try gz.add(.{ .tag = .ptr_type_simple, .data = .{ - .ptr_type_simple = .{ - .is_allowzero = false, - .is_mutable = true, - .is_volatile = false, - .size = .One, - .elem_type = int_type, - }, - } }); const result = try gz.addPlNode(tag, node, Zir.Inst.Cmpxchg{ // zig fmt: off - .ptr = try expr(gz, scope, .{ .coerced_ty = ptr_type }, params[1]), - .expected_value = try expr(gz, scope, .{ .coerced_ty = int_type }, params[2]), + .ptr = try expr(gz, scope, .none, params[1]), + .expected_value = try expr(gz, scope, .{ .ty = int_type }, params[2]), .new_value = try expr(gz, scope, .{ .coerced_ty = int_type }, params[3]), .success_order = try expr(gz, scope, .{ .coerced_ty = .atomic_order_type }, params[4]), .failure_order = try expr(gz, scope, .{ .coerced_ty = .atomic_order_type }, params[5]), diff --git a/src/Module.zig b/src/Module.zig index 55ec1fdd2c50..864663ada212 100644 --- a/src/Module.zig +++ b/src/Module.zig @@ -151,6 +151,8 @@ allocated_decls: std.SegmentedList(Decl, 0) = .{}, /// When a Decl object is freed from `allocated_decls`, it is pushed into this stack. 
decls_free_list: std.ArrayListUnmanaged(Decl.Index) = .{}, +global_assembly: std.AutoHashMapUnmanaged(Decl.Index, []u8) = .{}, + const MonomorphedFuncsSet = std.HashMapUnmanaged( *Fn, void, @@ -2831,6 +2833,7 @@ pub fn deinit(mod: *Module) void { mod.decls_free_list.deinit(gpa); mod.allocated_decls.deinit(gpa); + mod.global_assembly.deinit(gpa); } pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void { @@ -2842,6 +2845,9 @@ pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void { if (decl.deletion_flag) { assert(mod.deletion_set.swapRemove(decl_index)); } + if (mod.global_assembly.fetchRemove(decl_index)) |kv| { + gpa.free(kv.value); + } if (decl.has_tv) { if (decl.getInnerNamespace()) |namespace| { namespace.destroyDecls(mod); @@ -5714,3 +5720,12 @@ pub fn markDeclAlive(mod: *Module, decl: *Decl) void { fn markDeclIndexAlive(mod: *Module, decl_index: Decl.Index) void { return mod.markDeclAlive(mod.declPtr(decl_index)); } + +pub fn addGlobalAssembly(mod: *Module, decl_index: Decl.Index, source: []const u8) !void { + try mod.global_assembly.ensureUnusedCapacity(mod.gpa, 1); + + const duped_source = try mod.gpa.dupe(u8, source); + errdefer mod.gpa.free(duped_source); + + mod.global_assembly.putAssumeCapacityNoClobber(decl_index, duped_source); +} diff --git a/src/Sema.zig b/src/Sema.zig index 3ad955017509..9b4977c616ed 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -10517,16 +10517,35 @@ fn zirAsm( const is_volatile = @truncate(u1, extended.small >> 15) != 0; const is_global_assembly = sema.func == null; - if (block.is_comptime and !is_global_assembly) { - try sema.requireRuntimeBlock(block, src); - } - if (extra.data.asm_source == 0) { // This can move to become an AstGen error after inline assembly improvements land // and stage1 code matches stage2 code. return sema.fail(block, src, "assembly code must use string literal syntax", .{}); } + const asm_source = sema.code.nullTerminatedString(extra.data.asm_source); + + if (is_global_assembly) { + if (outputs_len != 0) { + return sema.fail(block, src, "module-level assembly does not support outputs", .{}); + } + if (inputs_len != 0) { + return sema.fail(block, src, "module-level assembly does not support inputs", .{}); + } + if (clobbers_len != 0) { + return sema.fail(block, src, "module-level assembly does not support clobbers", .{}); + } + if (is_volatile) { + return sema.fail(block, src, "volatile keyword is redundant on module-level assembly", .{}); + } + try sema.mod.addGlobalAssembly(sema.owner_decl_index, asm_source); + return Air.Inst.Ref.void_value; + } + + if (block.is_comptime) { + try sema.requireRuntimeBlock(block, src); + } + if (outputs_len > 1) { return sema.fail(block, src, "TODO implement Sema for asm with more than 1 output", .{}); } @@ -10591,7 +10610,6 @@ fn zirAsm( needed_capacity += name.*.len / 4 + 1; } - const asm_source = sema.code.nullTerminatedString(extra.data.asm_source); needed_capacity += (asm_source.len + 3) / 4; const gpa = sema.gpa; @@ -14715,51 +14733,64 @@ fn checkNumericType( } } -fn checkAtomicOperandType( +/// Returns the casted pointer. 
+fn checkAtomicPtrOperand( sema: *Sema, block: *Block, - ty_src: LazySrcLoc, - ty: Type, -) CompileError!void { - var buffer: Type.Payload.Bits = undefined; + elem_ty: Type, + elem_ty_src: LazySrcLoc, + ptr: Air.Inst.Ref, + ptr_src: LazySrcLoc, + ptr_const: bool, +) CompileError!Air.Inst.Ref { const target = sema.mod.getTarget(); - const max_atomic_bits = target_util.largestAtomicBits(target); - const int_ty = switch (ty.zigTypeTag()) { - .Int => ty, - .Enum => ty.intTagType(&buffer), - .Float => { - const bit_count = ty.floatBits(target); - if (bit_count > max_atomic_bits) { - return sema.fail( - block, - ty_src, - "expected {d}-bit float type or smaller; found {d}-bit float type", - .{ max_atomic_bits, bit_count }, - ); - } - return; - }, - .Bool => return, // Will be treated as `u8`. - else => { - if (ty.isPtrAtRuntime()) return; + var diag: target_util.AtomicPtrAlignmentDiagnostics = .{}; + const alignment = target_util.atomicPtrAlignment(target, elem_ty, &diag) catch |err| switch (err) { + error.FloatTooBig => return sema.fail( + block, + elem_ty_src, + "expected {d}-bit float type or smaller; found {d}-bit float type", + .{ diag.max_bits, diag.bits }, + ), + error.IntTooBig => return sema.fail( + block, + elem_ty_src, + "expected {d}-bit integer type or smaller; found {d}-bit integer type", + .{ diag.max_bits, diag.bits }, + ), + error.BadType => return sema.fail( + block, + elem_ty_src, + "expected bool, integer, float, enum, or pointer type; found {}", + .{elem_ty.fmt(sema.mod)}, + ), + }; - return sema.fail( - block, - ty_src, - "expected bool, integer, float, enum, or pointer type; found {}", - .{ty.fmt(sema.mod)}, - ); + var wanted_ptr_data: Type.Payload.Pointer.Data = .{ + .pointee_type = elem_ty, + .@"align" = alignment, + .@"addrspace" = .generic, + .mutable = !ptr_const, + }; + + const ptr_ty = sema.typeOf(ptr); + const ptr_data = switch (try ptr_ty.zigTypeTagOrPoison()) { + .Pointer => ptr_ty.ptrInfo().data, + else => { + const wanted_ptr_ty = try Type.ptr(sema.arena, sema.mod, wanted_ptr_data); + _ = try sema.coerce(block, wanted_ptr_ty, ptr, ptr_src); + unreachable; }, }; - const bit_count = int_ty.intInfo(target).bits; - if (bit_count > max_atomic_bits) { - return sema.fail( - block, - ty_src, - "expected {d}-bit integer type or smaller; found {d}-bit integer type", - .{ max_atomic_bits, bit_count }, - ); - } + + wanted_ptr_data.@"addrspace" = ptr_data.@"addrspace"; + wanted_ptr_data.@"allowzero" = ptr_data.@"allowzero"; + wanted_ptr_data.@"volatile" = ptr_data.@"volatile"; + + const wanted_ptr_ty = try Type.ptr(sema.arena, sema.mod, wanted_ptr_data); + const casted_ptr = try sema.coerce(block, wanted_ptr_ty, ptr, ptr_src); + + return casted_ptr; } fn checkPtrIsNotComptimeMutable( @@ -15036,10 +15067,8 @@ fn zirCmpxchg( const success_order_src: LazySrcLoc = .{ .node_offset_builtin_call_arg4 = inst_data.src_node }; const failure_order_src: LazySrcLoc = .{ .node_offset_builtin_call_arg5 = inst_data.src_node }; // zig fmt: on - const ptr = sema.resolveInst(extra.ptr); - const ptr_ty = sema.typeOf(ptr); - const elem_ty = ptr_ty.elemType(); - try sema.checkAtomicOperandType(block, elem_ty_src, elem_ty); + const expected_value = sema.resolveInst(extra.expected_value); + const elem_ty = sema.typeOf(expected_value); if (elem_ty.zigTypeTag() == .Float) { return sema.fail( block, @@ -15048,7 +15077,8 @@ fn zirCmpxchg( .{elem_ty.fmt(sema.mod)}, ); } - const expected_value = try sema.coerce(block, elem_ty, sema.resolveInst(extra.expected_value), expected_src); + const uncasted_ptr = 
sema.resolveInst(extra.ptr); + const ptr = try sema.checkAtomicPtrOperand(block, elem_ty, elem_ty_src, uncasted_ptr, ptr_src, false); const new_value = try sema.coerce(block, elem_ty, sema.resolveInst(extra.new_value), new_value_src); const success_order = try sema.resolveAtomicOrder(block, success_order_src, extra.success_order); const failure_order = try sema.resolveAtomicOrder(block, failure_order_src, extra.failure_order); @@ -15081,6 +15111,7 @@ fn zirCmpxchg( // to become undef as well return sema.addConstUndef(result_ty); } + const ptr_ty = sema.typeOf(ptr); const stored_val = (try sema.pointerDeref(block, ptr_src, ptr_val, ptr_ty)) orelse break :rs ptr_src; const result_val = if (stored_val.eql(expected_val, elem_ty, sema.mod)) blk: { try sema.storePtr(block, src, ptr, new_value); @@ -15487,17 +15518,16 @@ fn zirSelect(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air. fn zirAtomicLoad(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref { const inst_data = sema.code.instructions.items(.data)[inst].pl_node; - const extra = sema.code.extraData(Zir.Inst.Bin, inst_data.payload_index).data; + const extra = sema.code.extraData(Zir.Inst.AtomicLoad, inst_data.payload_index).data; // zig fmt: off const elem_ty_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = inst_data.src_node }; const ptr_src : LazySrcLoc = .{ .node_offset_builtin_call_arg1 = inst_data.src_node }; const order_src : LazySrcLoc = .{ .node_offset_builtin_call_arg2 = inst_data.src_node }; // zig fmt: on - const ptr = sema.resolveInst(extra.lhs); - const ptr_ty = sema.typeOf(ptr); - const elem_ty = ptr_ty.elemType(); - try sema.checkAtomicOperandType(block, elem_ty_src, elem_ty); - const order = try sema.resolveAtomicOrder(block, order_src, extra.rhs); + const elem_ty = try sema.resolveType(block, elem_ty_src, extra.elem_type); + const uncasted_ptr = sema.resolveInst(extra.ptr); + const ptr = try sema.checkAtomicPtrOperand(block, elem_ty, elem_ty_src, uncasted_ptr, ptr_src, true); + const order = try sema.resolveAtomicOrder(block, order_src, extra.ordering); switch (order) { .Release, .AcqRel => { @@ -15516,7 +15546,7 @@ fn zirAtomicLoad(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError! 
} if (try sema.resolveDefinedValue(block, ptr_src, ptr)) |ptr_val| { - if (try sema.pointerDeref(block, ptr_src, ptr_val, ptr_ty)) |elem_val| { + if (try sema.pointerDeref(block, ptr_src, ptr_val, sema.typeOf(ptr))) |elem_val| { return sema.addConstant(elem_ty, elem_val); } } @@ -15536,19 +15566,19 @@ fn zirAtomicRmw(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A const extra = sema.code.extraData(Zir.Inst.AtomicRmw, inst_data.payload_index).data; const src = inst_data.src(); // zig fmt: off - const operand_ty_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = inst_data.src_node }; + const elem_ty_src : LazySrcLoc = .{ .node_offset_builtin_call_arg0 = inst_data.src_node }; const ptr_src : LazySrcLoc = .{ .node_offset_builtin_call_arg1 = inst_data.src_node }; const op_src : LazySrcLoc = .{ .node_offset_builtin_call_arg2 = inst_data.src_node }; const operand_src : LazySrcLoc = .{ .node_offset_builtin_call_arg3 = inst_data.src_node }; const order_src : LazySrcLoc = .{ .node_offset_builtin_call_arg4 = inst_data.src_node }; // zig fmt: on - const ptr = sema.resolveInst(extra.ptr); - const ptr_ty = sema.typeOf(ptr); - const operand_ty = ptr_ty.elemType(); - try sema.checkAtomicOperandType(block, operand_ty_src, operand_ty); + const operand = sema.resolveInst(extra.operand); + const elem_ty = sema.typeOf(operand); + const uncasted_ptr = sema.resolveInst(extra.ptr); + const ptr = try sema.checkAtomicPtrOperand(block, elem_ty, elem_ty_src, uncasted_ptr, ptr_src, false); const op = try sema.resolveAtomicRmwOp(block, op_src, extra.operation); - switch (operand_ty.zigTypeTag()) { + switch (elem_ty.zigTypeTag()) { .Enum => if (op != .Xchg) { return sema.fail(block, op_src, "@atomicRmw with enum only allowed with .Xchg", .{}); }, @@ -15561,7 +15591,6 @@ fn zirAtomicRmw(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A }, else => {}, } - const operand = try sema.coerce(block, operand_ty, sema.resolveInst(extra.operand), operand_src); const order = try sema.resolveAtomicOrder(block, order_src, extra.ordering); if (order == .Unordered) { @@ -15569,8 +15598,8 @@ fn zirAtomicRmw(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A } // special case zero bit types - if (try sema.typeHasOnePossibleValue(block, operand_ty_src, operand_ty)) |val| { - return sema.addConstant(operand_ty, val); + if (try sema.typeHasOnePossibleValue(block, elem_ty_src, elem_ty)) |val| { + return sema.addConstant(elem_ty, val); } const runtime_src = if (try sema.resolveDefinedValue(block, ptr_src, ptr)) |ptr_val| rs: { @@ -15581,22 +15610,23 @@ fn zirAtomicRmw(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A }; if (ptr_val.isComptimeMutablePtr()) { const target = sema.mod.getTarget(); + const ptr_ty = sema.typeOf(ptr); const stored_val = (try sema.pointerDeref(block, ptr_src, ptr_val, ptr_ty)) orelse break :rs ptr_src; const new_val = switch (op) { // zig fmt: off .Xchg => operand_val, - .Add => try stored_val.numberAddWrap(operand_val, operand_ty, sema.arena, target), - .Sub => try stored_val.numberSubWrap(operand_val, operand_ty, sema.arena, target), - .And => try stored_val.bitwiseAnd (operand_val, operand_ty, sema.arena, target), - .Nand => try stored_val.bitwiseNand (operand_val, operand_ty, sema.arena, target), - .Or => try stored_val.bitwiseOr (operand_val, operand_ty, sema.arena, target), - .Xor => try stored_val.bitwiseXor (operand_val, operand_ty, sema.arena, target), - .Max => stored_val.numberMax (operand_val, target), - .Min => stored_val.numberMin 
(operand_val, target), + .Add => try stored_val.numberAddWrap(operand_val, elem_ty, sema.arena, target), + .Sub => try stored_val.numberSubWrap(operand_val, elem_ty, sema.arena, target), + .And => try stored_val.bitwiseAnd (operand_val, elem_ty, sema.arena, target), + .Nand => try stored_val.bitwiseNand (operand_val, elem_ty, sema.arena, target), + .Or => try stored_val.bitwiseOr (operand_val, elem_ty, sema.arena, target), + .Xor => try stored_val.bitwiseXor (operand_val, elem_ty, sema.arena, target), + .Max => stored_val.numberMax (operand_val, target), + .Min => stored_val.numberMin (operand_val, target), // zig fmt: on }; - try sema.storePtrVal(block, src, ptr_val, new_val, operand_ty); - return sema.addConstant(operand_ty, stored_val); + try sema.storePtrVal(block, src, ptr_val, new_val, elem_ty); + return sema.addConstant(elem_ty, stored_val); } else break :rs ptr_src; } else ptr_src; @@ -15620,15 +15650,15 @@ fn zirAtomicStore(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError const extra = sema.code.extraData(Zir.Inst.AtomicStore, inst_data.payload_index).data; const src = inst_data.src(); // zig fmt: off - const operand_ty_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = inst_data.src_node }; + const elem_ty_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = inst_data.src_node }; const ptr_src : LazySrcLoc = .{ .node_offset_builtin_call_arg1 = inst_data.src_node }; const operand_src : LazySrcLoc = .{ .node_offset_builtin_call_arg2 = inst_data.src_node }; const order_src : LazySrcLoc = .{ .node_offset_builtin_call_arg3 = inst_data.src_node }; // zig fmt: on - const ptr = sema.resolveInst(extra.ptr); - const operand_ty = sema.typeOf(ptr).elemType(); - try sema.checkAtomicOperandType(block, operand_ty_src, operand_ty); - const operand = try sema.coerce(block, operand_ty, sema.resolveInst(extra.operand), operand_src); + const operand = sema.resolveInst(extra.operand); + const elem_ty = sema.typeOf(operand); + const uncasted_ptr = sema.resolveInst(extra.ptr); + const ptr = try sema.checkAtomicPtrOperand(block, elem_ty, elem_ty_src, uncasted_ptr, ptr_src, false); const order = try sema.resolveAtomicOrder(block, order_src, extra.ordering); const air_tag: Air.Inst.Tag = switch (order) { diff --git a/src/Zir.zig b/src/Zir.zig index 9ef6d33b865f..30edd6d0e1cc 100644 --- a/src/Zir.zig +++ b/src/Zir.zig @@ -903,7 +903,7 @@ pub const Inst = struct { /// Uses the `pl_node` union field with payload `Select`. select, /// Implements the `@atomicLoad` builtin. - /// Uses the `pl_node` union field with payload `Bin`. + /// Uses the `pl_node` union field with payload `AtomicLoad`. atomic_load, /// Implements the `@atomicRmw` builtin. /// Uses the `pl_node` union field with payload `AtomicRmw`. @@ -3293,6 +3293,12 @@ pub const Inst = struct { ordering: Ref, }; + pub const AtomicLoad = struct { + elem_type: Ref, + ptr: Ref, + ordering: Ref, + }; + pub const MulAdd = struct { mulend1: Ref, mulend2: Ref, diff --git a/src/arch/x86_64/abi.zig b/src/arch/x86_64/abi.zig index f388c5c93a79..da2e3da3947f 100644 --- a/src/arch/x86_64/abi.zig +++ b/src/arch/x86_64/abi.zig @@ -12,13 +12,10 @@ pub fn classifyWindows(ty: Type, target: Target) Class { // and the registers used for those arguments. Any argument that doesn't fit in 8 // bytes, or isn't 1, 2, 4, or 8 bytes, must be passed by reference. A single argument // is never spread across multiple registers." + // "All floating point operations are done using the 16 XMM registers." 
// "Structs and unions of size 8, 16, 32, or 64 bits, and __m64 types, are passed // as if they were integers of the same size." - switch (ty.abiSize(target)) { - 1, 2, 4, 8 => {}, - else => return .memory, - } - return switch (ty.zigTypeTag()) { + switch (ty.zigTypeTag()) { .Pointer, .Int, .Bool, @@ -33,9 +30,13 @@ pub fn classifyWindows(ty: Type, target: Target) Class { .ErrorUnion, .AnyFrame, .Frame, - => .integer, + => switch (ty.abiSize(target)) { + 0 => unreachable, + 1, 2, 4, 8 => return .integer, + else => return .memory, + }, - .Float, .Vector => .sse, + .Float, .Vector => return .sse, .Type, .ComptimeFloat, @@ -47,7 +48,7 @@ pub fn classifyWindows(ty: Type, target: Target) Class { .Opaque, .EnumLiteral, => unreachable, - }; + } } /// There are a maximum of 8 possible return slots. Returned values are in diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 6c74fc822334..426ef7c378f9 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -476,6 +476,19 @@ pub const Object = struct { _ = builder.buildRet(is_lt); } + fn genModuleLevelAssembly(object: *Object, comp: *Compilation) !void { + const mod = comp.bin_file.options.module.?; + if (mod.global_assembly.count() == 0) return; + var buffer = std.ArrayList(u8).init(comp.gpa); + defer buffer.deinit(); + var it = mod.global_assembly.iterator(); + while (it.next()) |kv| { + try buffer.appendSlice(kv.value_ptr.*); + try buffer.append('\n'); + } + object.llvm_module.setModuleInlineAsm2(buffer.items.ptr, buffer.items.len - 1); + } + pub fn flushModule(self: *Object, comp: *Compilation, prog_node: *std.Progress.Node) !void { var sub_prog_node = prog_node.start("LLVM Emit Object", 0); sub_prog_node.activate(); @@ -484,6 +497,7 @@ pub const Object = struct { try self.genErrorNameTable(comp); try self.genCmpLtErrorsLenFunction(comp); + try self.genModuleLevelAssembly(comp); if (self.di_builder) |dib| { // When lowering debug info for pointers, we emitted the element types as @@ -630,7 +644,17 @@ pub const Object = struct { if (!param_ty.hasRuntimeBitsIgnoreComptime()) continue; const llvm_arg_i = @intCast(c_uint, args.items.len) + param_offset; - try args.append(llvm_func.getParam(llvm_arg_i)); + const param = llvm_func.getParam(llvm_arg_i); + // It is possible for the calling convention to make the argument's by-reference nature + // disagree with our canonical value for it, in which case we must dereference here. + const need_deref = !param_ty.isPtrAtRuntime() and !isByRef(param_ty) and + (param.typeOf().getTypeKind() == .Pointer); + const loaded_param = if (!need_deref) param else l: { + const load_inst = builder.buildLoad(param, ""); + load_inst.setAlignment(param_ty.abiAlignment(target)); + break :l load_inst; + }; + try args.append(loaded_param); } var di_file: ?*llvm.DIFile = null; @@ -3729,6 +3753,19 @@ pub const FuncGen = struct { arg_ptr.setAlignment(alignment); const store_inst = self.builder.buildStore(llvm_arg, arg_ptr); store_inst.setAlignment(alignment); + + if (abi_llvm_ty.getTypeKind() == .Pointer) { + // In this case, the calling convention wants a pointer, but + // we have a value. 
+ if (arg_ptr.typeOf() == abi_llvm_ty) { + try llvm_args.append(arg_ptr); + continue; + } + const casted_ptr = self.builder.buildBitCast(arg_ptr, abi_llvm_ty, ""); + try llvm_args.append(casted_ptr); + continue; + } + break :p self.builder.buildBitCast(arg_ptr, ptr_abi_ty, ""); }; @@ -7583,7 +7620,7 @@ pub const FuncGen = struct { const size_bytes = elem_ty.abiSize(target); _ = self.builder.buildMemCpy( self.builder.buildBitCast(ptr, llvm_ptr_u8, ""), - ptr_ty.ptrAlignment(target), + ptr_alignment, self.builder.buildBitCast(elem, llvm_ptr_u8, ""), elem_ty.abiAlignment(target), self.context.intType(Type.usize.intInfo(target).bits).constInt(size_bytes, .False), @@ -7917,6 +7954,8 @@ fn llvmFieldIndex( } fn firstParamSRet(fn_info: Type.Payload.Function.Data, target: std.Target) bool { + if (!fn_info.return_type.hasRuntimeBitsIgnoreComptime()) return false; + switch (fn_info.cc) { .Unspecified, .Inline => return isByRef(fn_info.return_type), .C => switch (target.cpu.arch) { @@ -8016,7 +8055,8 @@ fn lowerFnRetTy(dg: *DeclGen, fn_info: Type.Payload.Function.Data) !*const llvm. } } if (classes[0] == .integer and classes[1] == .none) { - return llvm_types_buffer[0]; + const abi_size = fn_info.return_type.abiSize(target); + return dg.context.intType(@intCast(c_uint, abi_size * 8)); } return dg.context.structType(&llvm_types_buffer, llvm_types_index, .False); }, @@ -8110,7 +8150,8 @@ fn lowerFnParamTy(dg: *DeclGen, cc: std.builtin.CallingConvention, ty: Type) !*c } } if (classes[0] == .integer and classes[1] == .none) { - return llvm_types_buffer[0]; + const abi_size = ty.abiSize(target); + return dg.context.intType(@intCast(c_uint, abi_size * 8)); } return dg.context.structType(&llvm_types_buffer, llvm_types_index, .False); }, diff --git a/src/codegen/llvm/bindings.zig b/src/codegen/llvm/bindings.zig index 560b9544dd41..4732e0dd490b 100644 --- a/src/codegen/llvm/bindings.zig +++ b/src/codegen/llvm/bindings.zig @@ -381,6 +381,9 @@ pub const Module = opaque { pub const createDIBuilder = ZigLLVMCreateDIBuilder; extern fn ZigLLVMCreateDIBuilder(module: *const Module, allow_unresolved: bool) *DIBuilder; + + pub const setModuleInlineAsm2 = LLVMSetModuleInlineAsm2; + extern fn LLVMSetModuleInlineAsm2(M: *const Module, Asm: [*]const u8, Len: usize) void; }; pub const lookupIntrinsicID = LLVMLookupIntrinsicID; diff --git a/src/print_zir.zig b/src/print_zir.zig index ac90c26cf65c..a9d0f4690f5b 100644 --- a/src/print_zir.zig +++ b/src/print_zir.zig @@ -283,6 +283,7 @@ const Writer = struct { => try self.writeStructInit(stream, inst), .cmpxchg_strong, .cmpxchg_weak => try self.writeCmpxchg(stream, inst), + .atomic_load => try self.writeAtomicLoad(stream, inst), .atomic_store => try self.writeAtomicStore(stream, inst), .atomic_rmw => try self.writeAtomicRmw(stream, inst), .memcpy => try self.writeMemcpy(stream, inst), @@ -351,7 +352,6 @@ const Writer = struct { .offset_of, .splat, .reduce, - .atomic_load, .bitcast, .vector_type, .maximum, @@ -929,6 +929,19 @@ const Writer = struct { try self.writeSrc(stream, inst_data.src()); } + fn writeAtomicLoad(self: *Writer, stream: anytype, inst: Zir.Inst.Index) !void { + const inst_data = self.code.instructions.items(.data)[inst].pl_node; + const extra = self.code.extraData(Zir.Inst.AtomicLoad, inst_data.payload_index).data; + + try self.writeInstRef(stream, extra.elem_type); + try stream.writeAll(", "); + try self.writeInstRef(stream, extra.ptr); + try stream.writeAll(", "); + try self.writeInstRef(stream, extra.ordering); + try stream.writeAll(") "); + try 
self.writeSrc(stream, inst_data.src()); + } + fn writeAtomicStore(self: *Writer, stream: anytype, inst: Zir.Inst.Index) !void { const inst_data = self.code.instructions.items(.data)[inst].pl_node; const extra = self.code.extraData(Zir.Inst.AtomicStore, inst_data.payload_index).data; diff --git a/src/stage1/analyze.cpp b/src/stage1/analyze.cpp index aef4966ee71f..34dff556e0ee 100644 --- a/src/stage1/analyze.cpp +++ b/src/stage1/analyze.cpp @@ -7686,6 +7686,7 @@ ZigType *make_int_type(CodeGen *g, bool is_signed, uint32_t size_in_bits) { // However for some targets, LLVM incorrectly reports this as 8. // See: https://github.com/ziglang/zig/issues/2987 entry->abi_align = 16; + entry->abi_size = align_forward(entry->abi_size, entry->abi_align); } } diff --git a/src/target.zig b/src/target.zig index 27ed1118dbbc..9249ed1b606f 100644 --- a/src/target.zig +++ b/src/target.zig @@ -1,5 +1,6 @@ const std = @import("std"); const llvm = @import("codegen/llvm/bindings.zig"); +const Type = @import("type.zig").Type; pub const ArchOsAbi = struct { arch: std.Target.Cpu.Arch, @@ -543,10 +544,28 @@ pub fn needUnwindTables(target: std.Target) bool { return target.os.tag == .windows; } -/// TODO this was ported from stage1 but it does not take into account CPU features, -/// which can affect this value. Audit this! -pub fn largestAtomicBits(target: std.Target) u32 { - return switch (target.cpu.arch) { +pub const AtomicPtrAlignmentError = error{ + FloatTooBig, + IntTooBig, + BadType, +}; + +pub const AtomicPtrAlignmentDiagnostics = struct { + bits: u16 = undefined, + max_bits: u16 = undefined, +}; + +/// If ABI alignment of `ty` is OK for atomic operations, returns 0. +/// Otherwise returns the alignment required on a pointer for the target +/// to perform atomic operations. +pub fn atomicPtrAlignment( + target: std.Target, + ty: Type, + diags: *AtomicPtrAlignmentDiagnostics, +) AtomicPtrAlignmentError!u32 { + // TODO this was ported from stage1 but it does not take into account CPU features, + // which can affect this value. Audit this! 
+ const max_atomic_bits: u16 = switch (target.cpu.arch) { .avr, .msp430, .spu_2, @@ -611,6 +630,40 @@ pub fn largestAtomicBits(target: std.Target) u32 { .x86_64 => 128, }; + + var buffer: Type.Payload.Bits = undefined; + + const int_ty = switch (ty.zigTypeTag()) { + .Int => ty, + .Enum => ty.intTagType(&buffer), + .Float => { + const bit_count = ty.floatBits(target); + if (bit_count > max_atomic_bits) { + diags.* = .{ + .bits = bit_count, + .max_bits = max_atomic_bits, + }; + return error.FloatTooBig; + } + return 0; + }, + .Bool => return 0, + else => { + if (ty.isPtrAtRuntime()) return 0; + return error.BadType; + }, + }; + + const bit_count = int_ty.intInfo(target).bits; + if (bit_count > max_atomic_bits) { + diags.* = .{ + .bits = bit_count, + .max_bits = max_atomic_bits, + }; + return error.IntTooBig; + } + + return 0; } pub fn defaultAddressSpace( diff --git a/src/type.zig b/src/type.zig index c96c512d9a09..ddeec596e18a 100644 --- a/src/type.zig +++ b/src/type.zig @@ -2788,11 +2788,6 @@ pub const Type = extern union { return AbiAlignmentAdvanced{ .scalar = target_util.defaultFunctionAlignment(target) }; }, - .i16, .u16 => return AbiAlignmentAdvanced{ .scalar = 2 }, - .i32, .u32 => return AbiAlignmentAdvanced{ .scalar = 4 }, - .i64, .u64 => return AbiAlignmentAdvanced{ .scalar = 8 }, - .u128, .i128 => return AbiAlignmentAdvanced{ .scalar = 16 }, - .isize, .usize, .single_const_pointer_to_comptime_int, @@ -2865,14 +2860,15 @@ pub const Type = extern union { // ABI alignment of vectors? .vector => return AbiAlignmentAdvanced{ .scalar = 16 }, + .i16, .u16 => return AbiAlignmentAdvanced{ .scalar = intAbiAlignment(16, target) }, + .i32, .u32 => return AbiAlignmentAdvanced{ .scalar = intAbiAlignment(32, target) }, + .i64, .u64 => return AbiAlignmentAdvanced{ .scalar = intAbiAlignment(64, target) }, + .u128, .i128 => return AbiAlignmentAdvanced{ .scalar = intAbiAlignment(128, target) }, + .int_signed, .int_unsigned => { const bits: u16 = ty.cast(Payload.Bits).?.data; if (bits == 0) return AbiAlignmentAdvanced{ .scalar = 0 }; - if (bits <= 8) return AbiAlignmentAdvanced{ .scalar = 1 }; - if (bits <= 16) return AbiAlignmentAdvanced{ .scalar = 2 }; - if (bits <= 32) return AbiAlignmentAdvanced{ .scalar = 4 }; - if (bits <= 64) return AbiAlignmentAdvanced{ .scalar = 8 }; - return AbiAlignmentAdvanced{ .scalar = 16 }; + return AbiAlignmentAdvanced{ .scalar = intAbiAlignment(bits, target) }; }, .optional => { @@ -3113,10 +3109,6 @@ pub const Type = extern union { assert(elem_size >= payload.elem_type.abiAlignment(target)); return (payload.len + 1) * elem_size; }, - .i16, .u16 => return 2, - .i32, .u32 => return 4, - .i64, .u64 => return 8, - .u128, .i128 => return 16, .isize, .usize, @@ -3189,10 +3181,14 @@ pub const Type = extern union { .error_set_merged, => return 2, // TODO revisit this when we have the concept of the error tag type + .i16, .u16 => return intAbiSize(16, target), + .i32, .u32 => return intAbiSize(32, target), + .i64, .u64 => return intAbiSize(64, target), + .u128, .i128 => return intAbiSize(128, target), .int_signed, .int_unsigned => { const bits: u16 = self.cast(Payload.Bits).?.data; if (bits == 0) return 0; - return std.math.ceilPowerOfTwoPromote(u16, (bits + 7) / 8); + return intAbiSize(bits, target); }, .optional => { @@ -3234,6 +3230,18 @@ pub const Type = extern union { }; } + fn intAbiSize(bits: u16, target: Target) u64 { + const alignment = intAbiAlignment(bits, target); + return std.mem.alignForwardGeneric(u64, (bits + 7) / 8, alignment); + } + + fn 
intAbiAlignment(bits: u16, target: Target) u32 { + return @minimum( + std.math.ceilPowerOfTwoPromote(u16, (bits + 7) / 8), + target.maxIntAlignment(), + ); + } + /// Asserts the type has the bit size already resolved. pub fn bitSize(ty: Type, target: Target) u64 { return switch (ty.tag()) { @@ -5169,7 +5177,7 @@ pub const Type = extern union { const field = it.struct_obj.fields.values()[it.field]; defer it.field += 1; - if (!field.ty.hasRuntimeBits()) + if (!field.ty.hasRuntimeBits() or field.is_comptime) return FieldOffset{ .field = it.field, .offset = it.offset }; const field_align = field.normalAlignment(it.target); diff --git a/test/behavior/align.zig b/test/behavior/align.zig index 3a35d1fcca5b..18a55f66532d 100644 --- a/test/behavior/align.zig +++ b/test/behavior/align.zig @@ -47,41 +47,128 @@ fn expects4(x: *align(4) u32) void { x.* += 1; } -test "alignment of structs" { +test "alignment of struct with pointer has same alignment as usize" { try expect(@alignOf(struct { a: i32, b: *i32, }) == @alignOf(usize)); } -test "alignment of >= 128-bit integer type" { - try expect(@alignOf(u128) == 16); - try expect(@alignOf(u129) == 16); -} - -test "alignment of struct with 128-bit field" { - try expect(@alignOf(struct { - x: u128, - }) == 16); - - comptime { - try expect(@alignOf(struct { - x: u128, - }) == 16); +test "alignment and size of structs with 128-bit fields" { + if (builtin.zig_backend == .stage1) { + // stage1 gets the wrong answer for a lot of targets + return error.SkipZigTest; } -} + if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO -test "size of extern struct with 128-bit field" { - try expect(@sizeOf(extern struct { + const A = struct { + x: u128, + }; + const B = extern struct { x: u128, y: u8, - }) == 32); - + }; + const expected = switch (builtin.cpu.arch) { + .arm, + .armeb, + .thumb, + .thumbeb, + .hexagon, + .mips, + .mipsel, + .powerpc, + .powerpcle, + .r600, + .amdgcn, + .riscv32, + .sparc, + .sparcel, + .s390x, + .lanai, + .wasm32, + .wasm64, + => .{ + .a_align = 8, + .a_size = 16, + + .b_align = 8, + .b_size = 24, + + .u128_align = 8, + .u128_size = 16, + .u129_align = 8, + .u129_size = 24, + }, + + .i386 => switch (builtin.os.tag) { + .windows => .{ + .a_align = 8, + .a_size = 16, + + .b_align = 8, + .b_size = 24, + + .u128_align = 8, + .u128_size = 16, + .u129_align = 8, + .u129_size = 24, + }, + else => .{ + .a_align = 4, + .a_size = 16, + + .b_align = 4, + .b_size = 20, + + .u128_align = 4, + .u128_size = 16, + .u129_align = 4, + .u129_size = 20, + }, + }, + + .mips64, + .mips64el, + .powerpc64, + .powerpc64le, + .riscv64, + .sparcv9, + .x86_64, + .aarch64, + .aarch64_be, + .aarch64_32, + .bpfel, + .bpfeb, + .nvptx, + .nvptx64, + => .{ + .a_align = 16, + .a_size = 16, + + .b_align = 16, + .b_size = 32, + + .u128_align = 16, + .u128_size = 16, + .u129_align = 16, + .u129_size = 32, + }, + + else => return error.SkipZigTest, + }; comptime { - try expect(@sizeOf(extern struct { - x: u128, - y: u8, - }) == 32); + std.debug.assert(@alignOf(A) == expected.a_align); + std.debug.assert(@sizeOf(A) == expected.a_size); + + std.debug.assert(@alignOf(B) == expected.b_align); + std.debug.assert(@sizeOf(B) == expected.b_size); + + std.debug.assert(@alignOf(u128) == expected.u128_align); + std.debug.assert(@sizeOf(u128) == expected.u128_size); + + std.debug.assert(@alignOf(u129) == expected.u129_align); + std.debug.assert(@sizeOf(u129) == expected.u129_size); } } @@ -328,7 
+415,6 @@ test "read 128-bit field from default aligned struct in stack memory" { .nevermind = 1, .badguy = 12, }; - try expect((@ptrToInt(&default_aligned.badguy) % 16) == 0); try expect(12 == default_aligned.badguy); } @@ -345,7 +431,6 @@ test "read 128-bit field from default aligned struct in global memory" { if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; - try expect((@ptrToInt(&default_aligned_global.badguy) % 16) == 0); try expect(12 == default_aligned_global.badguy); } diff --git a/test/behavior/asm.zig b/test/behavior/asm.zig index dab6f12127fb..9aa95a3a0b05 100644 --- a/test/behavior/asm.zig +++ b/test/behavior/asm.zig @@ -23,7 +23,6 @@ test "module level assembly" { if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_llvm) return error.SkipZigTest; // TODO if (is_x86_64_linux) { try expect(this_is_my_alias() == 1234); diff --git a/test/behavior/atomics.zig b/test/behavior/atomics.zig index 71e17d9b4c1a..62471c5ea03b 100644 --- a/test/behavior/atomics.zig +++ b/test/behavior/atomics.zig @@ -127,7 +127,7 @@ test "128-bit cmpxchg" { } fn test_u128_cmpxchg() !void { - var x: u128 = 1234; + var x: u128 align(16) = 1234; if (@cmpxchgWeak(u128, &x, 99, 5678, .SeqCst, .SeqCst)) |x1| { try expect(x1 == 1234); } else { diff --git a/test/behavior/bitcast.zig b/test/behavior/bitcast.zig index 6be3cc15c43c..4922b9473e15 100644 --- a/test/behavior/bitcast.zig +++ b/test/behavior/bitcast.zig @@ -120,6 +120,10 @@ test "bitcast generates a temporary value" { } test "@bitCast packed structs at runtime and comptime" { + if (builtin.zig_backend == .stage1) { + // stage1 gets the wrong answer for a lot of targets + return error.SkipZigTest; + } if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; @@ -138,18 +142,9 @@ test "@bitCast packed structs at runtime and comptime" { fn doTheTest() !void { var full = Full{ .number = 0x1234 }; var two_halves = @bitCast(Divided, full); - switch (native_endian) { - .Big => { - try expect(two_halves.half1 == 0x12); - try expect(two_halves.quarter3 == 0x3); - try expect(two_halves.quarter4 == 0x4); - }, - .Little => { - try expect(two_halves.half1 == 0x34); - try expect(two_halves.quarter3 == 0x2); - try expect(two_halves.quarter4 == 0x1); - }, - } + try expect(two_halves.half1 == 0x34); + try expect(two_halves.quarter3 == 0x2); + try expect(two_halves.quarter4 == 0x1); } }; try S.doTheTest(); diff --git a/test/behavior/struct.zig b/test/behavior/struct.zig index b05126db6225..24365d49b719 100644 --- a/test/behavior/struct.zig +++ b/test/behavior/struct.zig @@ -499,17 +499,18 @@ const Bitfields = packed struct { f7: u8, }; -test "native bit field understands endianness" { - if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; +test "packed struct fields are ordered from LSB to MSB" { + if (builtin.zig_backend == .stage1) { + // stage1 gets the wrong answer for a lot of targets + return error.SkipZigTest; + } + if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_c) return 
error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO - var all: u64 = if (native_endian != .Little) - 0x1111222233445677 - else - 0x7765443322221111; + var all: u64 = 0x7765443322221111; var bytes: [8]u8 = undefined; @memcpy(&bytes, @ptrCast([*]u8, &all), 8); var bitfields = @ptrCast(*Bitfields, &bytes).*; @@ -974,6 +975,8 @@ test "comptime struct field" { comptime b: i32 = 1234, }; + comptime std.debug.assert(@sizeOf(T) == 4); + var foo: T = undefined; comptime try expect(foo.b == 1234); }
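
The new integer layout rule in src/type.zig is small enough to restate on its own: ABI alignment is the integer's power-of-two byte size capped at Target.maxIntAlignment(), and ABI size is the byte size rounded up to that alignment. The sketch below is a hedged, self-contained reimplementation for illustration only; the `max_int_alignment` parameter and the test name are stand-ins (the real code reads the value from the Target), and the expected values come from the updated table in test/behavior/align.zig.

const std = @import("std");
const expect = std.testing.expect;

// Illustrative stand-ins for Type.intAbiAlignment/intAbiSize; the
// `max_int_alignment` parameter plays the role of Target.maxIntAlignment().
fn intAbiAlignment(bits: u16, max_int_alignment: u16) u32 {
    return @minimum(
        std.math.ceilPowerOfTwoPromote(u16, (bits + 7) / 8),
        max_int_alignment,
    );
}

fn intAbiSize(bits: u16, max_int_alignment: u16) u64 {
    const alignment = intAbiAlignment(bits, max_int_alignment);
    return std.mem.alignForwardGeneric(u64, (bits + 7) / 8, alignment);
}

test "integer ABI layout follows maxIntAlignment" {
    // Targets with maxIntAlignment() == 8 (mips, powerpc, riscv32, ...):
    try expect(intAbiAlignment(128, 8) == 8);
    try expect(intAbiSize(128, 8) == 16);
    try expect(intAbiSize(129, 8) == 24); // 17 bytes rounded up to alignment 8

    // i386 (non-Windows), maxIntAlignment() == 4:
    try expect(intAbiAlignment(129, 4) == 4);
    try expect(intAbiSize(129, 4) == 20);

    // x86_64, aarch64, ... with maxIntAlignment() == 16:
    try expect(intAbiAlignment(128, 16) == 16);
    try expect(intAbiSize(129, 16) == 32);
}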
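
On the atomics side, AstGen no longer fabricates a plain `*T` pointer type for `@atomicLoad`/`@atomicStore`/`@atomicRmw`/`@cmpxchg*`; Sema's checkAtomicPtrOperand builds the wanted pointer type from the element type plus the alignment required by atomicPtrAlignment, while keeping the address-space, allowzero, and volatile qualifiers of the pointer that was actually passed. The following is a hedged sketch of what that permits, written as ordinary behavior tests; the test and variable names are illustrative, not part of the change.

const std = @import("std");
const expect = std.testing.expect;

test "atomic builtins accept qualified pointers" {
    // The pointer operand is coerced by Sema, which preserves the volatile
    // qualifier of the given pointer instead of requiring exactly *u32.
    var x: u32 = 1234;
    const vptr: *volatile u32 = &x;
    try expect(@atomicLoad(u32, vptr, .SeqCst) == 1234);
    @atomicStore(u32, vptr, 42, .SeqCst);
    try expect(@atomicRmw(u32, vptr, .Add, 1, .SeqCst) == 42); // returns the old value
    try expect(@atomicLoad(u32, vptr, .SeqCst) == 43);
}

test "128-bit cmpxchg wants a 16-byte-aligned pointer" {
    // u128's ABI alignment is now derived from maxIntAlignment rather than
    // being hard-coded to 16, so the operand is over-aligned explicitly,
    // mirroring the change to test_u128_cmpxchg in test/behavior/atomics.zig.
    var x: u128 align(16) = 1234;
    if (@cmpxchgWeak(u128, &x, 99, 5678, .SeqCst, .SeqCst)) |old| {
        try expect(old == 1234);
    } else {
        return error.TestUnexpectedResult;
    }
}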
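
Module-level assembly is now collected per Decl by Module.addGlobalAssembly and emitted by the LLVM backend through LLVMSetModuleInlineAsm2, which is why the stage2_llvm skip is dropped from the "module level assembly" behavior test. The sketch below restates that test's pattern under the same x86_64-linux assumption; note that Sema rejects outputs, inputs, clobbers, and a redundant `volatile` on assembly that appears outside a function.

const builtin = @import("builtin");
const expect = @import("std").testing.expect;

const is_x86_64_linux = builtin.cpu.arch == .x86_64 and builtin.os.tag == .linux;

// Global assembly lives in a comptime block and must be a bare string
// literal (x86_64 ELF directives here, matching the existing test).
comptime {
    if (is_x86_64_linux) {
        asm (
            \\.globl this_is_my_alias;
            \\.type this_is_my_alias, @function;
            \\.set this_is_my_alias, derp;
        );
    }
}

extern fn this_is_my_alias() i32;

export fn derp() i32 {
    return 1234;
}

test "module level assembly" {
    if (is_x86_64_linux) {
        try expect(this_is_my_alias() == 1234);
    }
}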