diff --git a/.gitignore b/.gitignore
index 10e20410..0ca0892d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,9 @@
 # ignore python compiled object
 *.pyc
-# ignore vi temporary files
-*.swo
-*.swp
 .sconsign.dblite
 obj/
 cctest*
 bench_*
 libvixl*
 example-*
+vixl_stats.csv
diff --git a/README.md b/README.md
index e3c3a0ad..239a0d95 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-VIXL: AArch64 Runtime Code Generation Library Version 1.1
+VIXL: AArch64 Runtime Code Generation Library Version 1.2
 =========================================================
 
 Contents:
@@ -54,7 +54,7 @@ were deemed unnecessary:
 
  * No Advanced SIMD support.
  * Limited rounding mode support for floating point.
- * No support for synchronisation instructions.
+ * Limited support for synchronisation instructions.
  * Limited support for system instructions.
  * A few miscellaneous integer and floating point instructions are missing.
 
diff --git a/doc/changelog.md b/doc/changelog.md
index 8bab9323..09491e4e 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -1,6 +1,25 @@
 VIXL Change Log
 ===============
 
+* 1.2
+    + Added support for `fmadd`, `fnmadd`, `fnmsub`, `fminnm`, `fmaxnm`,
+      `frinta`, `fcvtau` and `fcvtas`.
+    + Added support for assembling and disassembling `isb`, `dsb` and `dmb`.
+    + Added support for automatic inversion of compare instructions when using
+      negative immediates.
+    + Added support for using `movn` when generating immediates.
+    + Added explicit flag-setting 'S' instructions, and removed
+      `SetFlags` and `LeaveFlags` arguments.
+    + Added support for `Movk` in the macro assembler.
+    + Added support for W register parameters to `Tbz` and `Tbnz`.
+    + Added support for using immediate operands with `Csel`.
+    + Added new debugger syntax for memory inspection.
+    + Fixed `smull`, `fmsub` and `sdiv` simulation.
+    + Fixed sign extension for W->X conversions using `sxtb`, `sxth` and `sxtw`.
+    + Prevented code generation for certain side-effect-free operations,
+      such as `add r, r, #0`, in the macro assembler.
+    + Other small bug fixes.
+
 * 1.1
     + Improved robustness of instruction decoder and disassembler.
     + Added support for double-to-float conversions using `fcvt`.
diff --git a/doc/supported-instructions.md b/doc/supported-instructions.md
index 90d63ec9..71839d4e 100644
--- a/doc/supported-instructions.md
+++ b/doc/supported-instructions.md
@@ -15,8 +15,16 @@ Add with carry bit.
 
     void adc(const Register& rd,
              const Register& rn,
-             const Operand& operand,
-             FlagsUpdate S = LeaveFlags)
+             const Operand& operand)
+
+
+### adcs ###
+
+Add with carry bit and update status flags.
+
+    void adcs(const Register& rd,
+              const Register& rn,
+              const Operand& operand)
 
 
 ### add ###
@@ -25,8 +33,16 @@ Add.
 
     void add(const Register& rd,
              const Register& rn,
-             const Operand& operand,
-             FlagsUpdate S = LeaveFlags)
+             const Operand& operand)
+
+
+### adds ###
+
+Add and update status flags.
+
+    void adds(const Register& rd,
+              const Register& rn,
+              const Operand& operand)
 
 
 ### adr ###
@@ -43,6 +59,24 @@ Calculate the address of a label.
     void adr(const Register& rd, Label* label)
 
 
+### and ###
+
+Bitwise and (A & B).
+
+    void and_(const Register& rd,
+              const Register& rn,
+              const Operand& operand)
+
+
+### ands ###
+
+Bitwise and (A & B) and update status flags.
+
+    void ands(const Register& rd,
+              const Register& rn,
+              const Operand& operand)
+
+
 ### asr ###
 
 Arithmetic shift right.
@@ -59,16 +93,30 @@ Arithmetic shift right by variable.
 
 ### b ###
 
-Branch to PC offset.
+Conditional branch to PC offset.
+
+    void b(int imm19, Condition cond)
+
+
+### b ###
+
+Conditional branch to label.
+
+    void b(Label* label, Condition cond)
+
+
+### b ###
+
+Unconditional branch to PC offset.
 
-    void b(int imm26, Condition cond = al)
+    void b(int imm26)
 
 
 ### b ###
 
-Branch to label.
+Unconditional branch to label.
 
-    void b(Label* label, Condition cond = al)
+    void b(Label* label)
 
 
 ### bfi ###
@@ -107,8 +155,16 @@ Bit clear (A & ~B).
 
     void bic(const Register& rd,
              const Register& rn,
-             const Operand& operand,
-             FlagsUpdate S = LeaveFlags)
+             const Operand& operand)
+
+
+### bics ###
+
+Bit clear (A & ~B) and update status flags.
+
+    void bics(const Register& rd,
+              const Register& rn,
+              const Operand& operand)
 
 
 ### bl ###
@@ -297,6 +353,20 @@ Conditional select negation: rd = cond ? rn : -rm.
                Condition cond)
 
 
+### dmb ###
+
+Data memory barrier.
+
+    void dmb(BarrierDomain domain, BarrierType type)
+
+
+### dsb ###
+
+Data synchronization barrier.
+
+    void dsb(BarrierDomain domain, BarrierType type)
+
+
 ### eon ###
 
 Bitwise enor/xnor (A ^ ~B).
@@ -335,6 +405,13 @@ Halting debug-mode breakpoint.
     void hlt(int code)
 
 
+### isb ###
+
+Instruction synchronization barrier.
+
+    void isb()
+
+
 ### ldnp ###
 
 Load integer or FP register pair, non-temporal.
@@ -530,8 +607,15 @@ Move inverted operand to register.
 Negate.
 
     void neg(const Register& rd,
-             const Operand& operand,
-             FlagsUpdate S = LeaveFlags)
+             const Operand& operand)
+
+
+### negs ###
+
+Negate and update status flags.
+
+    void negs(const Register& rd,
+              const Operand& operand)
 
 
 ### ngc ###
@@ -539,8 +623,15 @@ Negate.
 Negate with carry bit.
 
     void ngc(const Register& rd,
-             const Operand& operand,
-             FlagsUpdate S = LeaveFlags)
+             const Operand& operand)
+
+
+### ngcs ###
+
+Negate with carry bit and update status flags.
+
+    void ngcs(const Register& rd,
+              const Operand& operand)
 
 
 ### nop ###
@@ -619,8 +710,16 @@ Subtract with carry bit.
 
     void sbc(const Register& rd,
              const Register& rn,
-             const Operand& operand,
-             FlagsUpdate S = LeaveFlags)
+             const Operand& operand)
+
+
+### sbcs ###
+
+Subtract with carry bit and update status flags.
+
+    void sbcs(const Register& rd,
+              const Register& rn,
+              const Operand& operand)
 
 
 ### sbfiz ###
@@ -744,8 +843,16 @@ Subtract.
 
     void sub(const Register& rd,
              const Register& rn,
-             const Operand& operand,
-             FlagsUpdate S = LeaveFlags)
+             const Operand& operand)
+
+
+### subs ###
+
+Subtract and update status flags.
+
+    void subs(const Register& rd,
+              const Register& rn,
+              const Operand& operand)
 
 
 ### sxtb ###
@@ -943,11 +1050,25 @@ FP conditional select.
 
 ### fcvt ###
 
-FP convert single to double precision.
+FP convert between single and double precision.
 
     void fcvt(const FPRegister& fd, const FPRegister& fn)
 
 
+### fcvtas ###
+
+Convert FP to signed integer (nearest with ties to away).
+
+    void fcvtas(const Register& rd, const FPRegister& fn)
+
+
+### fcvtau ###
+
+Convert FP to unsigned integer (nearest with ties to away).
+
+    void fcvtau(const Register& rd, const FPRegister& fn)
+
+
 ### fcvtms ###
 
 Convert FP to signed integer (round towards -infinity).
@@ -997,6 +1118,16 @@ FP divide.
     void fdiv(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm)
 
 
+### fmadd ###
+
+FP fused multiply and add.
+
+    void fmadd(const FPRegister& fd,
+               const FPRegister& fn,
+               const FPRegister& fm,
+               const FPRegister& fa)
+
+
 ### fmax ###
 
 FP maximum.
@@ -1004,6 +1135,13 @@ FP maximum.
     void fmax(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm)
 
 
+### fmaxnm ###
+
+FP maximum number.
+
+    void fmaxnm(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm)
+
+
 ### fmin ###
 
 FP minimum.
@@ -1011,6 +1149,13 @@ FP minimum.
     void fmin(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm)
 
 
+### fminnm ###
+
+FP minimum number.
+
+    void fminnm(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm)
+
+
 ### fmov ###
 
 Move FP register to FP register.
@@ -1041,7 +1186,7 @@ Move register to FP register.
 
 ### fmsub ###
 
-FP multiply and subtract.
+FP fused multiply and subtract.
 
     void fmsub(const FPRegister& fd,
                const FPRegister& fn,
@@ -1063,6 +1208,33 @@ FP negate.
     void fneg(const FPRegister& fd, const FPRegister& fn)
 
 
+### fnmadd ###
+
+FP fused multiply, add and negate.
+
+    void fnmadd(const FPRegister& fd,
+                const FPRegister& fn,
+                const FPRegister& fm,
+                const FPRegister& fa)
+
+
+### fnmsub ###
+
+FP fused multiply, subtract and negate.
+
+    void fnmsub(const FPRegister& fd,
+                const FPRegister& fn,
+                const FPRegister& fm,
+                const FPRegister& fa)
+
+
+### frinta ###
+
+FP round to integer (nearest with ties to away).
+
+    void frinta(const FPRegister& fd, const FPRegister& fn)
+
+
 ### frintn ###
 
 FP round to integer (nearest with ties to even).
@@ -1123,11 +1295,4 @@ Emit raw instructions into the instruction stream.
     inline void dci(Instr raw_inst)
 
 
-### debug ###
-
-Debug control pseudo instruction, only supported by the debugger.
-
-    void debug(const char* message, uint32_t code, Instr params = BREAK)
-
-
 
diff --git a/src/a64/assembler-a64.cc b/src/a64/assembler-a64.cc
index 89f74063..eaee2758 100644
--- a/src/a64/assembler-a64.cc
+++ b/src/a64/assembler-a64.cc
@@ -176,28 +176,24 @@ const FPRegister& FPRegister::DRegFromCode(unsigned code) {
 
 const Register& CPURegister::W() const {
   ASSERT(IsValidRegister());
-  ASSERT(Is64Bits());
   return Register::WRegFromCode(code_);
 }
 
 
 const Register& CPURegister::X() const {
   ASSERT(IsValidRegister());
-  ASSERT(Is32Bits());
   return Register::XRegFromCode(code_);
 }
 
 
 const FPRegister& CPURegister::S() const {
   ASSERT(IsValidFPRegister());
-  ASSERT(Is64Bits());
   return FPRegister::SRegFromCode(code_);
 }
 
 
 const FPRegister& CPURegister::D() const {
   ASSERT(IsValidFPRegister());
-  ASSERT(Is32Bits());
   return FPRegister::DRegFromCode(code_);
 }
 
@@ -230,6 +226,9 @@ Operand::Operand(Register reg, Extend extend, unsigned shift_amount)
   ASSERT(reg.IsValid());
   ASSERT(shift_amount <= 4);
   ASSERT(!reg.IsSP());
+
+  // Extend modes SXTX and UXTX require a 64-bit register.
+  ASSERT(reg.Is64Bits() || ((extend != SXTX) && (extend != UXTX)));
 }
 
 
@@ -248,6 +247,15 @@ bool Operand::IsExtendedRegister() const {
 }
 
 
+// True when the operand is the immediate 0, or when its register IsZero().
+bool Operand::IsZero() const {
+  if (!IsImmediate()) {
+    return reg().IsZero();
+  }
+  return immediate() == 0;
+}
+
+
 Operand Operand::ToExtendedRegister() const {
   ASSERT(IsShiftedRegister());
   ASSERT((shift_ == LSL) && (shift_amount_ <= 4));
@@ -271,6 +279,9 @@ MemOperand::MemOperand(Register base,
   ASSERT(base.Is64Bits() && !base.IsZero());
   ASSERT(!regoffset.IsSP());
   ASSERT((extend == UXTW) || (extend == SXTW) || (extend == SXTX));
+
+  // SXTX extend mode requires a 64-bit offset register.
+  ASSERT(regoffset.Is64Bits() || (extend != SXTX));
 }
 
 
@@ -281,7 +292,7 @@ MemOperand::MemOperand(Register base,
   : base_(base), regoffset_(regoffset), offset_(0), addrmode_(Offset),
     shift_(shift), extend_(NO_EXTEND), shift_amount_(shift_amount) {
   ASSERT(base.Is64Bits() && !base.IsZero());
-  ASSERT(!regoffset.IsSP());
+  ASSERT(regoffset.Is64Bits() && !regoffset.IsSP());
   ASSERT(shift == LSL);
 }
 
@@ -303,7 +314,7 @@ MemOperand::MemOperand(Register base, const Operand& offset, AddrMode addrmode)
     offset_ = 0;
 
     // These assertions match those in the shifted-register constructor.
-    ASSERT(!regoffset_.IsSP());
+    ASSERT(regoffset_.Is64Bits() && !regoffset_.IsSP());
     ASSERT(shift_ == LSL);
   } else {
     ASSERT(offset.IsExtendedRegister());
@@ -319,6 +330,7 @@ MemOperand::MemOperand(Register base, const Operand& offset, AddrMode addrmode)
     // These assertions match those in the extended-register constructor.
     ASSERT(!regoffset_.IsSP());
     ASSERT((extend_ == UXTW) || (extend_ == SXTW) || (extend_ == SXTX));
+    ASSERT((regoffset_.Is64Bits() || (extend_ != SXTX)));
   }
 }
 
@@ -493,7 +505,7 @@ void Assembler::cbnz(const Register& rt,
 void Assembler::tbz(const Register& rt,
                     unsigned bit_pos,
                     int imm14) {
-  ASSERT(rt.Is64Bits());
+  ASSERT(rt.Is64Bits() || (rt.Is32Bits() && (bit_pos < kWRegSize)));
   Emit(TBZ | ImmTestBranchBit(bit_pos) | ImmTestBranch(imm14) | Rt(rt));
 }
 
@@ -508,7 +520,7 @@ void Assembler::tbz(const Register& rt,
 void Assembler::tbnz(const Register& rt,
                      unsigned bit_pos,
                      int imm14) {
-  ASSERT(rt.Is64Bits());
+  ASSERT(rt.Is64Bits() || (rt.Is32Bits() && (bit_pos < kWRegSize)));
   Emit(TBNZ | ImmTestBranchBit(bit_pos) | ImmTestBranch(imm14) | Rt(rt));
 }
 
@@ -533,81 +545,129 @@ void Assembler::adr(const Register& rd, Label* label) {
 
 void Assembler::add(const Register& rd,
                     const Register& rn,
-                    const Operand& operand,
-                    FlagsUpdate S) {
-  AddSub(rd, rn, operand, S, ADD);
+                    const Operand& operand) {
+  AddSub(rd, rn, operand, LeaveFlags, ADD);
+}
+
+
+void Assembler::adds(const Register& rd,
+                     const Register& rn,
+                     const Operand& operand) {
+  AddSub(rd, rn, operand, SetFlags, ADD);
 }
 
 
 void Assembler::cmn(const Register& rn,
                     const Operand& operand) {
   Register zr = AppropriateZeroRegFor(rn);
-  add(zr, rn, operand, SetFlags);
+  adds(zr, rn, operand);
 }
 
 
 void Assembler::sub(const Register& rd,
                     const Register& rn,
-                    const Operand& operand,
-                    FlagsUpdate S) {
-  AddSub(rd, rn, operand, S, SUB);
+                    const Operand& operand) {
+  AddSub(rd, rn, operand, LeaveFlags, SUB);
+}
+
+
+void Assembler::subs(const Register& rd,
+                     const Register& rn,
+                     const Operand& operand) {
+  AddSub(rd, rn, operand, SetFlags, SUB);
 }
 
 
 void Assembler::cmp(const Register& rn, const Operand& operand) {
   Register zr = AppropriateZeroRegFor(rn);
-  sub(zr, rn, operand, SetFlags);
+  subs(zr, rn, operand);
+}
+
+
+void Assembler::neg(const Register& rd, const Operand& operand) {
+  Register zr = AppropriateZeroRegFor(rd);
+  sub(rd, zr, operand);
 }
 
 
-void Assembler::neg(const Register& rd, const Operand& operand, FlagsUpdate S) {
+void Assembler::negs(const Register& rd, const Operand& operand) {
   Register zr = AppropriateZeroRegFor(rd);
-  sub(rd, zr, operand, S);
+  subs(rd, zr, operand);
 }
 
 
 void Assembler::adc(const Register& rd,
                     const Register& rn,
-                    const Operand& operand,
-                    FlagsUpdate S) {
-  AddSubWithCarry(rd, rn, operand, S, ADC);
+                    const Operand& operand) {
+  AddSubWithCarry(rd, rn, operand, LeaveFlags, ADC);
+}
+
+
+void Assembler::adcs(const Register& rd,
+                     const Register& rn,
+                     const Operand& operand) {
+  AddSubWithCarry(rd, rn, operand, SetFlags, ADC);
 }
 
 
 void Assembler::sbc(const Register& rd,
                     const Register& rn,
-                    const Operand& operand,
-                    FlagsUpdate S) {
-  AddSubWithCarry(rd, rn, operand, S, SBC);
+                    const Operand& operand) {
+  AddSubWithCarry(rd, rn, operand, LeaveFlags, SBC);
+}
+
+
+void Assembler::sbcs(const Register& rd,
+                     const Register& rn,
+                     const Operand& operand) {
+  AddSubWithCarry(rd, rn, operand, SetFlags, SBC);
+}
+
+
+void Assembler::ngc(const Register& rd, const Operand& operand) {
+  Register zr = AppropriateZeroRegFor(rd);
+  sbc(rd, zr, operand);
 }
 
 
-void Assembler::ngc(const Register& rd, const Operand& operand, FlagsUpdate S) {
+void Assembler::ngcs(const Register& rd, const Operand& operand) {
   Register zr = AppropriateZeroRegFor(rd);
-  sbc(rd, zr, operand, S);
+  sbcs(rd, zr, operand);
 }
 
 
 // Logical instructions.
 void Assembler::and_(const Register& rd,
                      const Register& rn,
-                     const Operand& operand,
-                     FlagsUpdate S) {
-  Logical(rd, rn, operand, (S == SetFlags) ? ANDS : AND);
+                     const Operand& operand) {
+  Logical(rd, rn, operand, AND);
+}
+
+
+void Assembler::ands(const Register& rd,
+                     const Register& rn,
+                     const Operand& operand) {
+  Logical(rd, rn, operand, ANDS);
 }
 
 
 void Assembler::tst(const Register& rn,
                     const Operand& operand) {
-  and_(AppropriateZeroRegFor(rn), rn, operand, SetFlags);
+  ands(AppropriateZeroRegFor(rn), rn, operand);
 }
 
 
 void Assembler::bic(const Register& rd,
                     const Register& rn,
-                    const Operand& operand,
-                    FlagsUpdate S) {
-  Logical(rd, rn, operand, (S == SetFlags) ? BICS : BIC);
+                    const Operand& operand) {
+  Logical(rd, rn, operand, BIC);
+}
+
+
+void Assembler::bics(const Register& rd,
+                     const Register& rn,
+                     const Operand& operand) {
+  Logical(rd, rn, operand, BICS);
 }
 
 
@@ -683,7 +743,7 @@ void Assembler::bfm(const Register& rd,
   ASSERT(rd.size() == rn.size());
   Instr N = SF(rd) >> (kSFOffset - kBitfieldNOffset);
   Emit(SF(rd) | BFM | N |
-       ImmR(immr, rd.size()) | ImmS(imms, rd.size()) | Rn(rn) | Rd(rd));
+       ImmR(immr, rd.size()) | ImmS(imms, rn.size()) | Rn(rn) | Rd(rd));
 }
 
 
@@ -691,10 +751,10 @@ void Assembler::sbfm(const Register& rd,
                      const Register& rn,
                      unsigned immr,
                      unsigned imms) {
-  ASSERT(rd.size() == rn.size());
+  ASSERT(rd.Is64Bits() || rn.Is32Bits());
   Instr N = SF(rd) >> (kSFOffset - kBitfieldNOffset);
   Emit(SF(rd) | SBFM | N |
-       ImmR(immr, rd.size()) | ImmS(imms, rd.size()) | Rn(rn) | Rd(rd));
+       ImmR(immr, rd.size()) | ImmS(imms, rn.size()) | Rn(rn) | Rd(rd));
 }
 
 
@@ -705,7 +765,7 @@ void Assembler::ubfm(const Register& rd,
   ASSERT(rd.size() == rn.size());
   Instr N = SF(rd) >> (kSFOffset - kBitfieldNOffset);
   Emit(SF(rd) | UBFM | N |
-       ImmR(immr, rd.size()) | ImmS(imms, rd.size()) | Rn(rn) | Rd(rd));
+       ImmR(immr, rd.size()) | ImmS(imms, rn.size()) | Rn(rn) | Rd(rd));
 }
 
 
@@ -716,7 +776,7 @@ void Assembler::extr(const Register& rd,
   ASSERT(rd.size() == rn.size());
   ASSERT(rd.size() == rm.size());
   Instr N = SF(rd) >> (kSFOffset - kBitfieldNOffset);
-  Emit(SF(rd) | EXTR | N | Rm(rm) | ImmS(lsb, rd.size()) | Rn(rn) | Rd(rd));
+  Emit(SF(rd) | EXTR | N | Rm(rm) | ImmS(lsb, rn.size()) | Rn(rn) | Rd(rd));
 }
 
 
@@ -1146,6 +1206,17 @@ void Assembler::hint(SystemHint code) {
   Emit(HINT | ImmHint(code) | Rt(xzr));
 }
 
+void Assembler::dmb(BarrierDomain domain, BarrierType type) {
+  Emit(DMB | ImmBarrierDomain(domain) | ImmBarrierType(type));
+}
+
+void Assembler::dsb(BarrierDomain domain, BarrierType type) {
+  Emit(DSB | ImmBarrierDomain(domain) | ImmBarrierType(type));
+}
+
+void Assembler::isb() {
+  Emit(ISB | ImmBarrierDomain(FullSystem) | ImmBarrierType(BarrierAll));
+}
 
 void Assembler::fmov(FPRegister fd, double imm) {
   if (fd.Is64Bits() && IsImmFP64(imm)) {
@@ -1202,6 +1273,14 @@ void Assembler::fmul(const FPRegister& fd,
 }
 
 
+// FP fused multiply and add; the opcode variant is chosen by fd's width.
+void Assembler::fmadd(const FPRegister& fd, const FPRegister& fn,
+                      const FPRegister& fm, const FPRegister& fa) {
+  const bool is_single = fd.Is32Bits();
+  FPDataProcessing3Source(fd, fn, fm, fa, is_single ? FMADD_s : FMADD_d);
+}
+
+
 void Assembler::fmsub(const FPRegister& fd,
                       const FPRegister& fn,
                       const FPRegister& fm,
@@ -1210,6 +1289,22 @@ void Assembler::fmsub(const FPRegister& fd,
 }
 
 
+// FP fused multiply, add and negate; opcode variant chosen by fd's width.
+void Assembler::fnmadd(const FPRegister& fd, const FPRegister& fn,
+                       const FPRegister& fm, const FPRegister& fa) {
+  const bool is_single = fd.Is32Bits();
+  FPDataProcessing3Source(fd, fn, fm, fa, is_single ? FNMADD_s : FNMADD_d);
+}
+
+
+// FP fused multiply, subtract and negate; opcode chosen by fd's width.
+void Assembler::fnmsub(const FPRegister& fd, const FPRegister& fn,
+                       const FPRegister& fm, const FPRegister& fa) {
+  const bool is_single = fd.Is32Bits();
+  FPDataProcessing3Source(fd, fn, fm, fa, is_single ? FNMSUB_s : FNMSUB_d);
+}
+
+
 void Assembler::fdiv(const FPRegister& fd,
                      const FPRegister& fn,
                      const FPRegister& fm) {
@@ -1224,6 +1319,13 @@ void Assembler::fmax(const FPRegister& fd,
 }
 
 
+// FP maximum number.
+void Assembler::fmaxnm(const FPRegister& fd, const FPRegister& fn,
+                       const FPRegister& fm) {
+  FPDataProcessing2Source(fd, fn, fm, FMAXNM);
+}
+
+
 void Assembler::fmin(const FPRegister& fd,
                      const FPRegister& fn,
                      const FPRegister& fm) {
@@ -1231,6 +1333,13 @@ void Assembler::fmin(const FPRegister& fd,
 }
 
 
+// FP minimum number.
+void Assembler::fminnm(const FPRegister& fd, const FPRegister& fn,
+                       const FPRegister& fm) {
+  FPDataProcessing2Source(fd, fn, fm, FMINNM);
+}
+
+
 void Assembler::fabs(const FPRegister& fd,
                      const FPRegister& fn) {
   ASSERT(fd.SizeInBits() == fn.SizeInBits());
@@ -1252,6 +1361,13 @@ void Assembler::fsqrt(const FPRegister& fd,
 }
 
 
+// FP round to integer (nearest with ties to away).
+void Assembler::frinta(const FPRegister& fd, const FPRegister& fn) {
+  ASSERT(fn.SizeInBits() == fd.SizeInBits());
+  FPDataProcessing1Source(fd, fn, FRINTA);
+}
+
+
 void Assembler::frintn(const FPRegister& fd,
                        const FPRegister& fn) {
   ASSERT(fd.SizeInBits() == fn.SizeInBits());
@@ -1324,6 +1440,16 @@ void Assembler::fcvt(const FPRegister& fd,
 }
 
 
+void Assembler::fcvtau(const Register& rd, const FPRegister& fn) {
+  FPConvertToInt(rd, fn, FCVTAU);
+}
+
+
+void Assembler::fcvtas(const Register& rd, const FPRegister& fn) {
+  FPConvertToInt(rd, fn, FCVTAS);
+}
+
+
 void Assembler::fcvtmu(const Register& rd, const FPRegister& fn) {
   FPConvertToInt(rd, fn, FCVTMU);
 }
@@ -1334,26 +1460,22 @@ void Assembler::fcvtms(const Register& rd, const FPRegister& fn) {
 }
 
 
-void Assembler::fcvtnu(const Register& rd,
-                       const FPRegister& fn) {
+void Assembler::fcvtnu(const Register& rd, const FPRegister& fn) {
   FPConvertToInt(rd, fn, FCVTNU);
 }
 
 
-void Assembler::fcvtns(const Register& rd,
-                       const FPRegister& fn) {
+void Assembler::fcvtns(const Register& rd, const FPRegister& fn) {
   FPConvertToInt(rd, fn, FCVTNS);
 }
 
 
-void Assembler::fcvtzu(const Register& rd,
-                       const FPRegister& fn) {
+void Assembler::fcvtzu(const Register& rd, const FPRegister& fn) {
   FPConvertToInt(rd, fn, FCVTZU);
 }
 
 
-void Assembler::fcvtzs(const Register& rd,
-                       const FPRegister& fn) {
+void Assembler::fcvtzs(const Register& rd, const FPRegister& fn) {
   FPConvertToInt(rd, fn, FCVTZS);
 }
 
diff --git a/src/a64/assembler-a64.h b/src/a64/assembler-a64.h
index 93b30118..43d31590 100644
--- a/src/a64/assembler-a64.h
+++ b/src/a64/assembler-a64.h
@@ -471,6 +471,7 @@ class Operand {
   bool IsImmediate() const;
   bool IsShiftedRegister() const;
   bool IsExtendedRegister() const;
+  bool IsZero() const;
 
   // This returns an LSL shift (<= 4) operand as an equivalent extend operand,
   // which helps in the encoding of instructions that use the stack pointer.
@@ -716,8 +717,12 @@ class Assembler {
   // Add.
   void add(const Register& rd,
            const Register& rn,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+
+  // Add and update status flags.
+  void adds(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
 
   // Compare negative.
   void cmn(const Register& rn, const Operand& operand);
@@ -725,40 +730,62 @@ class Assembler {
   // Subtract.
   void sub(const Register& rd,
            const Register& rn,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+
+  // Subtract and update status flags.
+  void subs(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
 
   // Compare.
   void cmp(const Register& rn, const Operand& operand);
 
   // Negate.
   void neg(const Register& rd,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+
+  // Negate and update status flags.
+  void negs(const Register& rd,
+            const Operand& operand);
 
   // Add with carry bit.
   void adc(const Register& rd,
            const Register& rn,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+
+  // Add with carry bit and update status flags.
+  void adcs(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
 
   // Subtract with carry bit.
   void sbc(const Register& rd,
            const Register& rn,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+
+  // Subtract with carry bit and update status flags.
+  void sbcs(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
 
   // Negate with carry bit.
   void ngc(const Register& rd,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+
+  // Negate with carry bit and update status flags.
+  void ngcs(const Register& rd,
+            const Operand& operand);
 
   // Logical instructions.
   // Bitwise and (A & B).
   void and_(const Register& rd,
             const Register& rn,
-            const Operand& operand,
-            FlagsUpdate S = LeaveFlags);
+            const Operand& operand);
+
+  // Bitwise and (A & B) and update status flags.
+  void ands(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
 
   // Bit test and set flags.
   void tst(const Register& rn, const Operand& operand);
@@ -766,8 +793,12 @@ class Assembler {
   // Bit clear (A & ~B).
   void bic(const Register& rd,
            const Register& rn,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+
+  // Bit clear (A & ~B) and update status flags.
+  void bics(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
 
   // Bitwise or (A | B).
   void orr(const Register& rd, const Register& rn, const Operand& operand);
@@ -1160,6 +1191,15 @@ class Assembler {
   // System hint.
   void hint(SystemHint code);
 
+  // Data memory barrier.
+  void dmb(BarrierDomain domain, BarrierType type);
+
+  // Data synchronization barrier.
+  void dsb(BarrierDomain domain, BarrierType type);
+
+  // Instruction synchronization barrier.
+  void isb();
+
   // Alias for system instructions.
   // No-op.
   void nop() {
@@ -1188,12 +1228,30 @@ class Assembler {
   // FP multiply.
   void fmul(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm);
 
-  // FP multiply and subtract.
+  // FP fused multiply and add.
+  void fmadd(const FPRegister& fd,
+             const FPRegister& fn,
+             const FPRegister& fm,
+             const FPRegister& fa);
+
+  // FP fused multiply and subtract.
   void fmsub(const FPRegister& fd,
              const FPRegister& fn,
              const FPRegister& fm,
              const FPRegister& fa);
 
+  // FP fused multiply, add and negate.
+  void fnmadd(const FPRegister& fd,
+              const FPRegister& fn,
+              const FPRegister& fm,
+              const FPRegister& fa);
+
+  // FP fused multiply, subtract and negate.
+  void fnmsub(const FPRegister& fd,
+              const FPRegister& fn,
+              const FPRegister& fm,
+              const FPRegister& fa);
+
   // FP divide.
   void fdiv(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm);
 
@@ -1203,6 +1261,12 @@ class Assembler {
   // FP minimum.
   void fmin(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm);
 
+  // FP maximum number.
+  void fmaxnm(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm);
+
+  // FP minimum number.
+  void fminnm(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm);
+
   // FP absolute.
   void fabs(const FPRegister& fd, const FPRegister& fn);
 
@@ -1212,6 +1276,9 @@ class Assembler {
   // FP square root.
   void fsqrt(const FPRegister& fd, const FPRegister& fn);
 
+  // FP round to integer (nearest with ties to away).
+  void frinta(const FPRegister& fd, const FPRegister& fn);
+
   // FP round to integer (nearest with ties to even).
   void frintn(const FPRegister& fd, const FPRegister& fn);
 
@@ -1244,6 +1311,12 @@ class Assembler {
   // FP convert between single and double precision.
   void fcvt(const FPRegister& fd, const FPRegister& fn);
 
+  // Convert FP to unsigned integer (nearest with ties to away).
+  void fcvtau(const Register& rd, const FPRegister& fn);
+
+  // Convert FP to signed integer (nearest with ties to away).
+  void fcvtas(const Register& rd, const FPRegister& fn);
+
   // Convert FP to unsigned integer (round towards -infinity).
   void fcvtmu(const Register& rd, const FPRegister& fn);
 
@@ -1517,6 +1590,16 @@ class Assembler {
     return imm7 << ImmHint_offset;
   }
 
+  static Instr ImmBarrierDomain(int imm2) {
+    ASSERT(is_uint2(imm2));
+    return imm2 << ImmBarrierDomain_offset;
+  }
+
+  static Instr ImmBarrierType(int imm2) {
+    ASSERT(is_uint2(imm2));
+    return imm2 << ImmBarrierType_offset;
+  }
+
   static LSDataSize CalcLSDataSize(LoadStoreOp op) {
     ASSERT((SizeLS_offset + SizeLS_width) == (kInstructionSize * 8));
     return static_cast<LSDataSize>(op >> SizeLS_offset);
diff --git a/src/a64/constants-a64.h b/src/a64/constants-a64.h
index 2e0336dd..1ac26f4a 100644
--- a/src/a64/constants-a64.h
+++ b/src/a64/constants-a64.h
@@ -116,6 +116,8 @@ V_(ImmCmpBranch, 23, 5, SignedBits)                                            \
 V_(ImmLLiteral, 23, 5, SignedBits)                                             \
 V_(ImmException, 20, 5, Bits)                                                  \
 V_(ImmHint, 11, 5, Bits)                                                       \
+V_(ImmBarrierDomain, 11, 10, Bits)                                             \
+V_(ImmBarrierType, 9, 8, Bits)                                                 \
                                                                                \
 /* System (MRS, MSR) */                                                        \
 V_(ImmSystemRegister, 19, 5, Bits)                                             \
@@ -246,6 +248,20 @@ enum SystemHint {
   SEVL  = 5
 };
 
+enum BarrierDomain {  // Values for the 2-bit ImmBarrierDomain field.
+  OuterShareable = 0,
+  NonShareable   = 1,
+  InnerShareable = 2,
+  FullSystem     = 3
+};
+
+enum BarrierType {  // Values for the 2-bit ImmBarrierType field.
+  BarrierOther  = 0,
+  BarrierReads  = 1,
+  BarrierWrites = 2,
+  BarrierAll    = 3
+};
+
 // System/special register names.
 // This information is not encoded as one field but as the concatenation of
 // multiple fields (Op0<0>, Op1, Crn, Crm, Op2).
@@ -560,6 +576,15 @@ enum ExceptionOp {
   DCPS3          = ExceptionFixed | 0x00A00003
 };
 
+enum MemBarrierOp {
+  MemBarrierFixed = 0xD503309F,
+  MemBarrierFMask = 0xFFFFF09F,
+  MemBarrierMask  = 0xFFFFF0FF,
+  DSB             = MemBarrierFixed | 0x00000000,
+  DMB             = MemBarrierFixed | 0x00000020,
+  ISB             = MemBarrierFixed | 0x00000040
+};
+
 // Any load or store.
 enum LoadStoreAnyOp {
   LoadStoreAnyFMask = 0x0a000000,
@@ -927,17 +952,22 @@ enum FPDataProcessing1SourceOp {
   FRINTN   = FRINTN_s,
   FRINTP_s = FPDataProcessing1SourceFixed | 0x00048000,
   FRINTP_d = FPDataProcessing1SourceFixed | FP64 | 0x00048000,
+  FRINTP   = FRINTP_s,
   FRINTM_s = FPDataProcessing1SourceFixed | 0x00050000,
   FRINTM_d = FPDataProcessing1SourceFixed | FP64 | 0x00050000,
+  FRINTM   = FRINTM_s,
   FRINTZ_s = FPDataProcessing1SourceFixed | 0x00058000,
   FRINTZ_d = FPDataProcessing1SourceFixed | FP64 | 0x00058000,
   FRINTZ   = FRINTZ_s,
   FRINTA_s = FPDataProcessing1SourceFixed | 0x00060000,
   FRINTA_d = FPDataProcessing1SourceFixed | FP64 | 0x00060000,
+  FRINTA   = FRINTA_s,
   FRINTX_s = FPDataProcessing1SourceFixed | 0x00070000,
   FRINTX_d = FPDataProcessing1SourceFixed | FP64 | 0x00070000,
+  FRINTX   = FRINTX_s,
   FRINTI_s = FPDataProcessing1SourceFixed | 0x00078000,
-  FRINTI_d = FPDataProcessing1SourceFixed | FP64 | 0x00078000
+  FRINTI_d = FPDataProcessing1SourceFixed | FP64 | 0x00078000,
+  FRINTI   = FRINTI_s
 };
 
 // Floating point data processing 2 source.
diff --git a/src/a64/debugger-a64.cc b/src/a64/debugger-a64.cc
index ecdc8355..c146a452 100644
--- a/src/a64/debugger-a64.cc
+++ b/src/a64/debugger-a64.cc
@@ -35,7 +35,7 @@ C(ContinueCommand)             \
 C(StepCommand)                 \
 C(DisasmCommand)               \
 C(PrintCommand)                \
-C(MemCommand)
+C(ExamineCommand)
 
 // Debugger command lines are broken up in token of different type to make
 // processing easier later on.
@@ -173,26 +173,22 @@ class IntegerToken : public ValueToken<int64_t> {
 };
 
 // Literal describing how to print a chunk of data (up to 64 bits).
-// Format: %qt
-// where q (qualifier) is one of
+// Format: .ln
+// where l (letter) is one of
+//  * x: hexadecimal
 //  * s: signed integer
 //  * u: unsigned integer
-//  * a: hexadecimal floating point
-// and t (type) is one of
-//  * x: 64-bit integer
-//  * w: 32-bit integer
-//  * h: 16-bit integer
-//  * b: 8-bit integer
-//  * c: character
-//  * d: double
-//  * s: float
-// When no qualifier is given for integers, they are printed in hexadecinal.
+//  * f: floating point
+//  * i: instruction
+// and n (size) is one of 8, 16, 32 and 64. n should be omitted for
+// instructions.
 class FormatToken : public Token {
  public:
   FormatToken() {}
 
   virtual bool IsFormat() const { return true; }
   virtual int SizeOf() const = 0;
+  virtual char type_code() const = 0;
   virtual void PrintData(void* data, FILE* out = stdout) const = 0;
   virtual void Print(FILE* out = stdout) const = 0;
 
@@ -206,9 +202,10 @@ class FormatToken : public Token {
 
 template<typename T> class Format : public FormatToken {
  public:
-  explicit Format(const char* fmt) : fmt_(fmt) {}
+  Format(const char* fmt, char type_code) : fmt_(fmt), type_code_(type_code) {}
 
   virtual int SizeOf() const { return sizeof(T); }
+  virtual char type_code() const { return type_code_; }
   virtual void PrintData(void* data, FILE* out = stdout) const {
     T value;
     memcpy(&value, data, sizeof(value));
@@ -218,6 +215,7 @@ template<typename T> class Format : public FormatToken {
 
  private:
   const char* fmt_;
+  char type_code_;
 };
 
 // Tokens which don't fit any of the above.
@@ -314,37 +312,25 @@ class StepCommand : public DebugCommand {
 
 class DisasmCommand : public DebugCommand {
  public:
-  DisasmCommand(Token* name, Token* target, IntegerToken* count)
-      : DebugCommand(name), target_(target), count_(count) {}
-  virtual ~DisasmCommand() {
-    delete target_;
-    delete count_;
-  }
-
-  Token* target() { return target_; }
-  int64_t count() { return count_->value(); }
-  virtual bool Run(Debugger* debugger);
-  virtual void Print(FILE* out = stdout);
-
   static DebugCommand* Build(std::vector<Token*> args);
 
   static const char* kHelp;
   static const char* kAliases[];
   static const char* kArguments;
-
- private:
-  Token* target_;
-  IntegerToken* count_;
 };
 
 
 class PrintCommand : public DebugCommand {
  public:
-  PrintCommand(Token* name, Token* target)
-      : DebugCommand(name), target_(target) {}
-  virtual ~PrintCommand() { delete target_; }
+  PrintCommand(Token* name, Token* target, FormatToken* format)
+      : DebugCommand(name), target_(target), format_(format) {}
+  virtual ~PrintCommand() {
+    delete target_;
+    delete format_;
+  }
 
   Token* target() { return target_; }
+  FormatToken* format() { return format_; }
   virtual bool Run(Debugger* debugger);
   virtual void Print(FILE* out = stdout);
 
@@ -356,24 +342,25 @@ class PrintCommand : public DebugCommand {
 
  private:
   Token* target_;
+  FormatToken* format_;
 };
 
-class MemCommand : public DebugCommand {
+class ExamineCommand : public DebugCommand {
  public:
-  MemCommand(Token* name,
-             Token* target,
-             IntegerToken* count,
-             FormatToken* format)
-      : DebugCommand(name), target_(target), count_(count), format_(format) {}
-  virtual ~MemCommand() {
+  ExamineCommand(Token* name,
+                 Token* target,
+                 FormatToken* format,
+                 IntegerToken* count)
+      : DebugCommand(name), target_(target), format_(format), count_(count) {}
+  virtual ~ExamineCommand() {
     delete target_;
-    delete count_;
     delete format_;
+    delete count_;
   }
 
   Token* target() { return target_; }
-  int64_t count() { return count_->value(); }
   FormatToken* format() { return format_; }
+  IntegerToken* count() { return count_; }
   virtual bool Run(Debugger* debugger);
   virtual void Print(FILE* out = stdout);
 
@@ -385,8 +372,8 @@ class MemCommand : public DebugCommand {
 
  private:
   Token* target_;
-  IntegerToken* count_;
   FormatToken* format_;
+  IntegerToken* count_;
 };
 
 // Commands which name does not match any of the known commnand.
@@ -418,40 +405,48 @@ class InvalidCommand : public DebugCommand {
 
 const char* HelpCommand::kAliases[] = { "help", NULL };
 const char* HelpCommand::kArguments = NULL;
-const char* HelpCommand::kHelp = "  print this help";
+const char* HelpCommand::kHelp = "  Print this help.";
 
 const char* ContinueCommand::kAliases[] = { "continue", "c", NULL };
 const char* ContinueCommand::kArguments = NULL;
-const char* ContinueCommand::kHelp = "  resume execution";
+const char* ContinueCommand::kHelp = "  Resume execution.";
 
 const char* StepCommand::kAliases[] = { "stepi", "si", NULL };
 const char* StepCommand::kArguments = "[n = 1]";
-const char* StepCommand::kHelp = "  execute n next instruction(s)";
+const char* StepCommand::kHelp = "  Execute n next instruction(s).";
 
-const char* DisasmCommand::kAliases[] = { "dis", "d", NULL };
-const char* DisasmCommand::kArguments = "[addr = pc] [n = 1]";
+const char* DisasmCommand::kAliases[] = { "disasm", "di", NULL };
+const char* DisasmCommand::kArguments = "[n = 10]";
 const char* DisasmCommand::kHelp =
-  "  disassemble n instruction(s) at address addr.\n"
-  "  addr can be an immediate address, a register or the pc."
+  "  Disassemble n instruction(s) at pc.\n"
+  "  This command is equivalent to x pc.i [n = 10]."
 ;
 
 const char* PrintCommand::kAliases[] = { "print", "p", NULL };
-const char* PrintCommand::kArguments =  "<entity>";
+const char* PrintCommand::kArguments =  "<entity>[.format]";
 const char* PrintCommand::kHelp =
-  "  print the given entity\n"
-  "  entity can be 'regs' for W and X registers, 'fpregs' for S and D\n"
-  "  registers, 'sysregs' for system registers (including NZCV) or 'pc'."
+  "  Print the given entity according to the given format.\n"
+  "  The format parameter only affects individual registers; it is ignored\n"
+  "  for other entities.\n"
+  "  <entity> can be one of the following:\n"
+  "   * A register name (such as x0, s1, ...).\n"
+  "   * 'regs', to print all integer (W and X) registers.\n"
+  "   * 'fpregs' to print all floating-point (S and D) registers.\n"
+  "   * 'sysregs' to print all system registers (including NZCV).\n"
+  "   * 'pc' to print the current program counter.\n"
 ;
 
-const char* MemCommand::kAliases[] = { "mem", "m", NULL };
-const char* MemCommand::kArguments = "<addr> [n = 1] [format = %x]";
-const char* MemCommand::kHelp =
-  "  print n memory item(s) at address addr according to the given format.\n"
-  "  addr can be an immediate address, a register or the pc.\n"
-  "  format is made of a qualifer: 's', 'u', 'a' (signed, unsigned, hexa)\n"
-  "  and a type 'x', 'w', 'h', 'b' (64- to 8-bit integer), 'c' (character),\n"
-  "  's' (float) or 'd' (double). E.g 'mem sp %w' will print a 32-bit word\n"
-  "  from the stack as an hexadecimal number."
+const char* ExamineCommand::kAliases[] = { "m", "mem", "x", NULL };
+const char* ExamineCommand::kArguments = "<addr>[.format] [n = 10]";
+const char* ExamineCommand::kHelp =
+  "  Examine memory. Print n items of memory at address <addr> according to\n"
+  "  the given [.format].\n"
+  "  Addr can be an immediate address, a register name or pc.\n"
+  "  Format is made of a type letter: 'x' (hexadecimal), 's' (signed), 'u'\n"
+  "  (unsigned), 'f' (floating point), i (instruction) and a size in bits\n"
+  "  when appropriate (8, 16, 32, 64)\n"
+  "  E.g 'x sp.x64' will print 10 64-bit words from the stack in\n"
+  "  hexadecimal format."
 ;
 
 const char* RegisterToken::kXAliases[kNumberOfRegisters][kMaxAliasNumber] = {
@@ -539,6 +534,7 @@ Debugger::Debugger(Decoder* decoder, FILE* stream)
 
 
 void Debugger::Run() {
+  pc_modified_ = false;
   while (pc_ != kEndOfSimAddress) {
     if (pending_request()) {
       LogProcessorState();
@@ -571,8 +567,8 @@ void Debugger::PrintInstructions(void* address, int64_t count) {
 
 
 void Debugger::PrintMemory(const uint8_t* address,
-                           int64_t count,
-                           const FormatToken* format) {
+                           const FormatToken* format,
+                           int64_t count) {
   if (count == 0) {
     return;
   }
@@ -586,7 +582,7 @@ void Debugger::PrintMemory(const uint8_t* address,
   const uint8_t* to = from + count * size;
 
   for (const uint8_t* current = from; current < to; current += size) {
-    if (((current - from) % 16) == 0) {
+    if (((current - from) % 8) == 0) {
       printf("\n%p: ", current);
     }
 
@@ -598,6 +594,54 @@ void Debugger::PrintMemory(const uint8_t* address,
 }
 
 
+void Debugger::PrintRegister(const Register& target_reg,
+                             const char* name,  // Label printed before " = ".
+                             const FormatToken* format) {
+  const uint64_t reg_size = target_reg.SizeInBits();
+  const uint64_t format_size = format->SizeOf() * 8;  // In bits.
+  const uint64_t count = reg_size / format_size;  // Chunks per register.
+  const uint64_t mask = 0xffffffffffffffff >> (64 - format_size);
+  const uint64_t reg_value = reg<uint64_t>(reg_size,
+                                           target_reg.code(),
+                                           Reg31IsStackPointer);
+  ASSERT(count > 0);  // The format must not be wider than the register.
+  // Print "<name> = " followed by each format-sized chunk of the register.
+  printf("%s = ", name);
+  for (uint64_t i = 1; i <= count; i++) {  // Most significant chunk first.
+    uint64_t data = reg_value >> (reg_size - (i * format_size));
+    data &= mask;  // Keep only the low format_size bits.
+    format->PrintData(&data);
+    printf(" ");
+  }
+  printf("\n");
+}
+
+
+void Debugger::PrintFPRegister(const FPRegister& target_fpreg,
+                               const FormatToken* format) {
+  const uint64_t fpreg_size = target_fpreg.SizeInBits();
+  const uint64_t format_size = format->SizeOf() * 8;  // In bits.
+  const uint64_t count = fpreg_size / format_size;  // Chunks per register.
+  const uint64_t mask = 0xffffffffffffffff >> (64 - format_size);
+  const uint64_t fpreg_value = fpreg<uint64_t>(fpreg_size,
+                                               target_fpreg.code());
+  ASSERT(count > 0);  // The format must not be wider than the register.
+  // Print "s<n> = " or "d<n> = ", then each format-sized chunk of the value.
+  if (target_fpreg.Is32Bits()) {
+    printf("s%u = ", target_fpreg.code());
+  } else {
+    printf("d%u = ", target_fpreg.code());
+  }
+  for (uint64_t i = 1; i <= count; i++) {  // Most significant chunk first.
+    uint64_t data = fpreg_value >> (fpreg_size - (i * format_size));
+    data &= mask;  // Keep only the low format_size bits.
+    format->PrintData(&data);
+    printf(" ");
+  }
+  printf("\n");
+}
+
+
 void Debugger::VisitException(Instruction* instr) {
   switch (instr->Mask(ExceptionMask)) {
     case BRK:
@@ -873,11 +917,6 @@ Token* Token::Tokenize(const char* arg) {
     return token;
   }
 
-  token = FormatToken::Tokenize(arg);
-  if (token != NULL) {
-    return token;
-  }
-
   return new UnknownToken(arg);
 }
 
@@ -1039,61 +1078,79 @@ Token* IntegerToken::Tokenize(const char* arg) {
 
 
 Token* FormatToken::Tokenize(const char* arg) {
-  if (arg[0] != '%') {
+  int length = strlen(arg);
+  switch (arg[0]) {
+    case 'x':
+    case 's':
+    case 'u':
+    case 'f':
+      if (length == 1) return NULL;
+      break;  // A size in bits must follow; it is parsed below.
+    case 'i':
+      if (length == 1) return new Format<uint32_t>("%08" PRIx32, 'i');
+    default: return NULL;  // Also reached by 'i' followed by more text.
+  }
+
+  char* endptr = NULL;
+  errno = 0;  // Reset errors.
+  uint64_t count = strtoul(arg + 1, &endptr, 10);  // Size in bits.
+
+  if (errno != 0) {
+    // Overflow, etc.
     return NULL;
   }
 
-  int length = strlen(arg);
-  if ((length < 2) || (length > 3)) {
+  if ((arg[1] < '0') || (arg[1] > '9')) {
+    // The size must start with a digit; strtoul alone would accept a sign.
     return NULL;
   }
 
-  char type = arg[length - 1];
-  if (length == 2) {
-    switch (type) {
-      case 'x': return new Format<uint64_t>("%016" PRIx64);
-      case 'w': return new Format<uint32_t>("%08" PRIx32);
-      case 'h': return new Format<uint16_t>("%04" PRIx16);
-      case 'b': return new Format<uint8_t>("%02" PRIx8);
-      case 'c': return new Format<char>("%c");
-      case 'd': return new Format<double>("%g");
-      case 's': return new Format<float>("%g");
-      default: return NULL;
-    }
+  if (*endptr != '\0') {
+    // There are unexpected (non-digit) characters after the number.
     return NULL;
   }
 
-  ASSERT(length == 3);
-  switch (arg[1]) {
+  switch (arg[0]) {
+    case 'x':
+      switch (count) {
+        case 8: return new Format<uint8_t>("%02" PRIx8, 'x');
+        case 16: return new Format<uint16_t>("%04" PRIx16, 'x');
+        case 32: return new Format<uint32_t>("%08" PRIx32, 'x');
+        case 64: return new Format<uint64_t>("%016" PRIx64, 'x');
+        default: return NULL;
+      }
     case 's':
-      switch (type) {
-        case 'x': return new Format<int64_t>("%+20" PRId64);
-        case 'w': return new Format<int32_t>("%+11" PRId32);
-        case 'h': return new Format<int16_t>("%+6" PRId16);
-        case 'b': return new Format<int8_t>("%+4" PRId8);
+      switch (count) {
+        case 8: return new Format<int8_t>("%4" PRId8, 's');
+        case 16: return new Format<int16_t>("%6" PRId16, 's');
+        case 32: return new Format<int32_t>("%11" PRId32, 's');
+        case 64: return new Format<int64_t>("%20" PRId64, 's');
         default: return NULL;
       }
     case 'u':
-      switch (type) {
-        case 'x': return new Format<uint64_t>("%20" PRIu64);
-        case 'w': return new Format<uint32_t>("%10" PRIu32);
-        case 'h': return new Format<uint16_t>("%5" PRIu16);
-        case 'b': return new Format<uint8_t>("%3" PRIu8);
+      switch (count) {
+        case 8: return new Format<uint8_t>("%3" PRIu8, 'u');
+        case 16: return new Format<uint16_t>("%5" PRIu16, 'u');
+        case 32: return new Format<uint32_t>("%10" PRIu32, 'u');
+        case 64: return new Format<uint64_t>("%20" PRIu64, 'u');
         default: return NULL;
       }
-    case 'a':
-      switch (type) {
-        case 'd': return new Format<double>("%a");
-        case 's': return new Format<float>("%a");
+    case 'f':
+      switch (count) {
+        case 32: return new Format<float>("%13g", 'f');
+        case 64: return new Format<double>("%13g", 'f');
         default: return NULL;
       }
-    default: return NULL;
+    default:
+      UNREACHABLE();
+      return NULL;
   }
 }
 
 
 template<typename T>
 void Format<T>::Print(FILE* out) const {
-  fprintf(out, "[Format %s - %lu byte(s)]", fmt_, sizeof(T));
+  fprintf(out, "[Format %c%lu - %s]", type_code_, sizeof(T) * 8, fmt_);
 }
 
 
@@ -1121,10 +1178,25 @@ bool DebugCommand::Match(const char* name, const char** aliases) {
 DebugCommand* DebugCommand::Parse(char* line) {
   std::vector<Token*> args;
 
-  for (char* chunk = strtok(line, " ");
+  for (char* chunk = strtok(line, " \t");
        chunk != NULL;
-       chunk = strtok(NULL, " ")) {
-    args.push_back(Token::Tokenize(chunk));
+       chunk = strtok(NULL, " \t")) {
+    char* dot = strchr(chunk, '.');
+    if (dot != NULL) {
+      // 'Token.format'.
+      Token* format = FormatToken::Tokenize(dot + 1);
+      if (format != NULL) {
+        *dot = '\0';
+        args.push_back(Token::Tokenize(chunk));
+        args.push_back(format);
+      } else {
+        // Error while parsing the format, push the UnknownToken so an error
+        // can be accurately reported.
+        args.push_back(Token::Tokenize(chunk));
+      }
+    } else {
+      args.push_back(Token::Tokenize(chunk));
+    }
   }
 
   if (args.size() == 0) {
@@ -1132,7 +1204,7 @@ DebugCommand* DebugCommand::Parse(char* line) {
   }
 
   if (!args[0]->IsIdentifier()) {
-    return new InvalidCommand(args, 0, "command name is not an identifier");
+    return new InvalidCommand(args, 0, "command name is not valid");
   }
 
   const char* name = IdentifierToken::Cast(args[0])->value();
@@ -1249,66 +1321,36 @@ DebugCommand* StepCommand::Build(std::vector<Token*> args) {
 }
 
 
-bool DisasmCommand::Run(Debugger* debugger) {
-  ASSERT(debugger->IsDebuggerRunning());
-
-  uint8_t* from = target()->ToAddress(debugger);
-  debugger->PrintInstructions(from, count());
-
-  return false;
-}
-
-
-void DisasmCommand::Print(FILE* out) {
-  fprintf(out, "%s ", name());
-  target()->Print(out);
-  fprintf(out, " %" PRId64 "", count());
-}
-
-
 DebugCommand* DisasmCommand::Build(std::vector<Token*> args) {
-  Token* address = NULL;
   IntegerToken* count = NULL;
   switch (args.size()) {
-    case 1: {  // disasm [pc] [1]
-      address = new IdentifierToken("pc");
-      count = new IntegerToken(1);
-      break;
-    }
-    case 2: {  // disasm [pc] n or disasm address [1]
-      Token* first = args[1];
-      if (first->IsInteger()) {
-        address = new IdentifierToken("pc");
-        count = IntegerToken::Cast(first);
-      } else if (first->CanAddressMemory()) {
-        address = first;
-        count = new IntegerToken(1);
-      } else {
-        return new InvalidCommand(args, 1, "expects int or addr");
-      }
+    case 1: {  // disasm [10]
+      count = new IntegerToken(10);  // Default instruction count.
       break;
     }
-    case 3: {  // disasm address count
+    case 2: {  // disasm n
       Token* first = args[1];
-      Token* second = args[2];
-      if (!first->CanAddressMemory() || !second->IsInteger()) {
-        return new InvalidCommand(args, -1, "disasm addr int");
+      if (!first->IsInteger()) {
+        return new InvalidCommand(args, 1, "expects int");
       }
-      address = first;
-      count = IntegerToken::Cast(second);
+      // Reuse the parsed integer token as the instruction count.
+      count = IntegerToken::Cast(first);
       break;
     }
     default:
-      return new InvalidCommand(args, -1, "wrong arguments number");
+      return new InvalidCommand(args, -1, "too many arguments");
   }
 
-  return new DisasmCommand(args[0], address, count);
+  Token* target = new IdentifierToken("pc");  // Always disassemble at pc.
+  FormatToken* format = new Format<uint32_t>("%08" PRIx32, 'i');
+  return new ExamineCommand(args[0], target, format, count);  // 'x pc.i n'.
 }
 
 
 void PrintCommand::Print(FILE* out) {
   fprintf(out, "%s ", name());
   target()->Print(out);
+  if (format() != NULL) format()->Print(out);
 }
 
 
@@ -1333,30 +1375,24 @@ bool PrintCommand::Run(Debugger* debugger) {
     return false;
   }
 
+  FormatToken* format_tok = format();
+  ASSERT(format_tok != NULL);
+  if (format_tok->type_code() == 'i') {
+    // TODO(all): Add support for instruction disassembly.
+    printf(" ** unsupported format: instructions **\n");
+    return false;
+  }
+
   if (tok->IsRegister()) {
     RegisterToken* reg_tok = RegisterToken::Cast(tok);
     Register reg = reg_tok->value();
-    if (reg.Is32Bits()) {
-      printf("%s = %" PRId32 "\n",
-             reg_tok->Name(),
-             debugger->wreg(reg.code(), Reg31IsStackPointer));
-    } else {
-      printf("%s = %" PRId64 "\n",
-             reg_tok->Name(),
-             debugger->xreg(reg.code(), Reg31IsStackPointer));
-    }
-
+    debugger->PrintRegister(reg, reg_tok->Name(), format_tok);
     return false;
   }
 
   if (tok->IsFPRegister()) {
     FPRegister fpreg = FPRegisterToken::Cast(tok)->value();
-    if (fpreg.Is32Bits()) {
-      printf("s%u = %g\n", fpreg.code(), debugger->sreg(fpreg.code()));
-    } else {
-      printf("d%u = %g\n", fpreg.code(), debugger->dreg(fpreg.code()));
-    }
-
+    debugger->PrintFPRegister(fpreg, format_tok);
     return false;
   }
 
@@ -1366,91 +1402,144 @@ bool PrintCommand::Run(Debugger* debugger) {
 
 
 DebugCommand* PrintCommand::Build(std::vector<Token*> args) {
-  Token* target = NULL;
+  if (args.size() < 2) {
+    return new InvalidCommand(args, -1, "too few arguments");
+  }
+
+  Token* target = args[1];
+  if (!target->IsRegister() &&
+      !target->IsFPRegister() &&
+      !target->IsIdentifier()) {
+    return new InvalidCommand(args, 1, "expects reg or identifier");
+  }
+
+  FormatToken* format = NULL;  // Stays NULL for identifier targets.
+  int target_size = 0;  // In bytes; stays 0 for identifier targets.
+  if (target->IsRegister()) {
+    Register reg = RegisterToken::Cast(target)->value();
+    target_size = reg.SizeInBytes();
+  } else if (target->IsFPRegister()) {
+    FPRegister fpreg = FPRegisterToken::Cast(target)->value();
+    target_size = fpreg.SizeInBytes();
+  }
+  // If the target is an identifier there must be no format. This is checked
+  // in the switch statement below.
+
   switch (args.size()) {
     case 2: {
-      target = args[1];
-      if (!target->IsRegister()
-          && !target->IsFPRegister()
-          && !target->IsIdentifier()) {
-        return new InvalidCommand(args, 1, "expects reg or identifier");
+      if (target->IsRegister()) {
+        switch (target_size) {  // Default: hexadecimal, full register width.
+          case 4: format = new Format<uint32_t>("%08" PRIx32, 'x'); break;
+          case 8: format = new Format<uint64_t>("%016" PRIx64, 'x'); break;
+          default: UNREACHABLE();
+        }
+      } else if (target->IsFPRegister()) {
+        switch (target_size) {  // Default: floating point, full register width.
+          case 4: format = new Format<float>("%8g", 'f'); break;
+          case 8: format = new Format<double>("%8g", 'f'); break;
+          default: UNREACHABLE();
+        }
       }
       break;
     }
+    case 3: {  // print <register>.<format>
+      if (target->IsIdentifier()) {
+        return new InvalidCommand(args, 2,
+            "format is only allowed with registers");
+      }
+
+      Token* second = args[2];
+      if (!second->IsFormat()) {
+        return new InvalidCommand(args, 2, "expects format");
+      }
+      format = FormatToken::Cast(second);
+
+      if (format->SizeOf() > target_size) {  // Both sizes are in bytes.
+        return new InvalidCommand(args, 2, "format too wide");
+      }
+
+      break;
+    }
     default:
       return new InvalidCommand(args, -1, "too many arguments");
   }
 
-  return new PrintCommand(args[0], target);
+  return new PrintCommand(args[0], target, format);
 }
 
 
-bool MemCommand::Run(Debugger* debugger) {
+bool ExamineCommand::Run(Debugger* debugger) {
   ASSERT(debugger->IsDebuggerRunning());
 
   uint8_t* address = target()->ToAddress(debugger);
-  debugger->PrintMemory(address, count(), format());
+  int64_t  amount = count()->value();  // Number of items to print.
+  if (format()->type_code() == 'i') {  // 'i': disassemble instead of dumping.
+    debugger->PrintInstructions(address, amount);
+  } else {
+    debugger->PrintMemory(address, format(), amount);
+  }
 
   return false;
 }
 
 
-void MemCommand::Print(FILE* out) {
+void ExamineCommand::Print(FILE* out) {
   fprintf(out, "%s ", name());
-  target()->Print(out);
-  fprintf(out, " %" PRId64 " ", count());
   format()->Print(out);
+  target()->Print(out);
 }
 
 
-DebugCommand* MemCommand::Build(std::vector<Token*> args) {
+DebugCommand* ExamineCommand::Build(std::vector<Token*> args) {
   if (args.size() < 2) {
     return new InvalidCommand(args, -1, "too few arguments");
   }
 
   Token* target = args[1];
-  IntegerToken* count = NULL;
-  FormatToken* format = NULL;
-
   if (!target->CanAddressMemory()) {
     return new InvalidCommand(args, 1, "expects address");
   }
 
+  FormatToken* format = NULL;  // Defaults to x64 when not given.
+  IntegerToken* count = NULL;  // Defaults to 10 when not given.
+
   switch (args.size()) {
-    case 2: {  // mem addressable [1] [%x]
-      count = new IntegerToken(1);
-      format = new Format<uint64_t>("%016x");
+    case 2: {  // mem addr[.x64] [10]
+      format = new Format<uint64_t>("%016" PRIx64, 'x');
+      count = new IntegerToken(10);
       break;
     }
-    case 3: {  // mem addr n [%x] or mem addr [n] %f
+    case 3: {  // mem addr.format [10]
+               // mem addr[.x64] n
       Token* second = args[2];
-      if (second->IsInteger()) {
-        count = IntegerToken::Cast(second);
-        format = new Format<uint64_t>("%016x");
-      } else if (second->IsFormat()) {
-        count = new IntegerToken(1);
+      if (second->IsFormat()) {
         format = FormatToken::Cast(second);
+        count = new IntegerToken(10);
+        break;
+      } else if (second->IsInteger()) {
+        format = new Format<uint64_t>("%016" PRIx64, 'x');
+        count = IntegerToken::Cast(second);
       } else {
-        return new InvalidCommand(args, 2, "expects int or format");
+        return new InvalidCommand(args, 2, "expects format or integer");
       }
+      // The integer form falls out of the if/else with format and count set.
       break;
     }
-    case 4: {  // mem addr n %f
+    case 4: {  // mem addr.format n
       Token* second = args[2];
       Token* third = args[3];
-      if (!second->IsInteger() || !third->IsFormat()) {
-        return new InvalidCommand(args, -1, "mem addr >>int<< %F");
+      if (!second->IsFormat() || !third->IsInteger()) {
+        return new InvalidCommand(args, -1, "expects addr[.format] [n]");
       }
-
-      count = IntegerToken::Cast(second);
-      format = FormatToken::Cast(third);
+      format = FormatToken::Cast(second);
+      count = IntegerToken::Cast(third);
       break;
     }
     default:
       return new InvalidCommand(args, -1, "too many arguments");
   }
 
-  return new MemCommand(args[0], target, count, format);
+  return new ExamineCommand(args[0], target, format, count);
 }
 
 
diff --git a/src/a64/debugger-a64.h b/src/a64/debugger-a64.h
index 542d2025..1f7888a6 100644
--- a/src/a64/debugger-a64.h
+++ b/src/a64/debugger-a64.h
@@ -156,8 +156,13 @@ class Debugger : public Simulator {
 
   void PrintInstructions(void* address, int64_t count = 1);
   void PrintMemory(const uint8_t* address,
-                   int64_t count,
-                   const FormatToken* format);
+                   const FormatToken* format,
+                   int64_t count = 1);
+  void PrintRegister(const Register& target_reg,
+                     const char* name,
+                     const FormatToken* format);
+  void PrintFPRegister(const FPRegister& target_fpreg,
+                       const FormatToken* format);
 
  private:
   void LogSystemRegisters();
diff --git a/src/a64/disasm-a64.cc b/src/a64/disasm-a64.cc
index 4a497480..616fca56 100644
--- a/src/a64/disasm-a64.cc
+++ b/src/a64/disasm-a64.cc
@@ -1082,6 +1082,14 @@ void Disassembler::VisitFPIntegerConvert(Instruction* instr) {
     case FMOV_xd: mnemonic = "fmov"; form = form_rf; break;
     case FMOV_sw:
     case FMOV_dx: mnemonic = "fmov"; form = form_fr; break;
+    case FCVTAS_ws:
+    case FCVTAS_xs:
+    case FCVTAS_wd:
+    case FCVTAS_xd: mnemonic = "fcvtas"; form = form_rf; break;
+    case FCVTAU_ws:
+    case FCVTAU_xs:
+    case FCVTAU_wd:
+    case FCVTAU_xd: mnemonic = "fcvtau"; form = form_rf; break;
     case FCVTMS_ws:
     case FCVTMS_xs:
     case FCVTMS_wd:
@@ -1184,6 +1192,24 @@ void Disassembler::VisitSystem(Instruction* instr) {
         break;
       }
     }
+  } else if (instr->Mask(MemBarrierFMask) == MemBarrierFixed) {
+    switch (instr->Mask(MemBarrierMask)) {
+      case DMB: {
+        mnemonic = "dmb";
+        form = "'M";
+        break;
+      }
+      case DSB: {
+        mnemonic = "dsb";
+        form = "'M";
+        break;
+      }
+      case ISB: {
+        mnemonic = "isb";
+        form = NULL;
+        break;
+      }
+    }
   }
 
   Format(instr, mnemonic, form);
@@ -1268,6 +1294,7 @@ int Disassembler::SubstituteField(Instruction* instr, const char* format) {
     case 'A': return SubstitutePCRelAddressField(instr, format);
     case 'B': return SubstituteBranchTargetField(instr, format);
     case 'O': return SubstituteLSRegOffsetField(instr, format);
+    case 'M': return SubstituteBarrierField(instr, format);
     default: {
       UNREACHABLE();
       return 1;
@@ -1654,6 +1681,23 @@ int Disassembler::SubstitutePrefetchField(Instruction* instr,
   return 6;
 }
 
+int Disassembler::SubstituteBarrierField(Instruction* instr,
+                                         const char* format) {
+  ASSERT(format[0] == 'M');
+  USE(format);  // format is otherwise only read by the ASSERT above.
+
+  static const char* options[4][4] = {  // Indexed as [domain][type].
+    { "sy (0b0000)", "oshld", "oshst", "osh" },
+    { "sy (0b0100)", "nshld", "nshst", "nsh" },
+    { "sy (0b1000)", "ishld", "ishst", "ish" },
+    { "sy (0b1100)", "ld", "st", "sy" }
+  };
+  int domain = instr->ImmBarrierDomain();  // 2-bit field, bits 11:10.
+  int type = instr->ImmBarrierType();  // 2-bit field, bits 9:8.
+  // Append the option name selected by the two barrier fields.
+  AppendToOutput("%s", options[domain][type]);
+  return 1;  // NOTE(review): presumably the format characters consumed.
+}
 
 void Disassembler::ResetOutput() {
   buffer_pos_ = 0;
diff --git a/src/a64/disasm-a64.h b/src/a64/disasm-a64.h
index 857a5aca..3a56e155 100644
--- a/src/a64/disasm-a64.h
+++ b/src/a64/disasm-a64.h
@@ -64,6 +64,7 @@ class Disassembler: public DecoderVisitor {
   int SubstituteBranchTargetField(Instruction* instr, const char* format);
   int SubstituteLSRegOffsetField(Instruction* instr, const char* format);
   int SubstitutePrefetchField(Instruction* instr, const char* format);
+  int SubstituteBarrierField(Instruction* instr, const char* format);
 
   inline bool RdIsZROrSP(Instruction* instr) const {
     return (instr->Rd() == kZeroRegCode);
diff --git a/src/a64/instructions-a64.h b/src/a64/instructions-a64.h
index 0f31fcd7..a7558b2a 100644
--- a/src/a64/instructions-a64.h
+++ b/src/a64/instructions-a64.h
@@ -93,6 +93,7 @@ static const float kFP32SignallingNaN = rawbits_to_float(0x7f800001);
 static const double kFP64QuietNaN = rawbits_to_double(0x7ff800007fc00001);
 static const float kFP32QuietNaN = rawbits_to_float(0x7fc00001);
 
+
 enum LSDataSize {
   LSByte        = 0,
   LSHalfword    = 1,
diff --git a/src/a64/instrument-a64.cc b/src/a64/instrument-a64.cc
index 507410d0..841173c3 100644
--- a/src/a64/instrument-a64.cc
+++ b/src/a64/instrument-a64.cc
@@ -151,7 +151,7 @@ Instrument::~Instrument() {
   // Free all the counter objects.
   std::list<Counter*>::iterator it;
   for (it = counters_.begin(); it != counters_.end(); it++) {
-    free(*it);
+    delete *it;
   }
 
   if (output_stream_ != stdout) {
diff --git a/src/a64/instrument-a64.h b/src/a64/instrument-a64.h
index bee965ba..d8ddb46b 100644
--- a/src/a64/instrument-a64.h
+++ b/src/a64/instrument-a64.h
@@ -54,7 +54,6 @@ enum CounterType {
 class Counter {
  public:
   Counter(const char* name, CounterType type = Gauge);
-  ~Counter();
 
   void Increment();
   void Enable();
diff --git a/src/a64/macro-assembler-a64.cc b/src/a64/macro-assembler-a64.cc
index 39a925c2..a7e2c2ef 100644
--- a/src/a64/macro-assembler-a64.cc
+++ b/src/a64/macro-assembler-a64.cc
@@ -29,26 +29,40 @@ namespace vixl {
 
 void MacroAssembler::And(const Register& rd,
                          const Register& rn,
-                         const Operand& operand,
-                         FlagsUpdate S) {
+                         const Operand& operand) {
   ASSERT(allow_macro_instructions_);
-  LogicalMacro(rd, rn, operand, (S == SetFlags) ? ANDS : AND);
+  LogicalMacro(rd, rn, operand, AND);
+}
+
+
+void MacroAssembler::Ands(const Register& rd,
+                          const Register& rn,
+                          const Operand& operand) {
+  ASSERT(allow_macro_instructions_);
+  LogicalMacro(rd, rn, operand, ANDS);
 }
 
 
 void MacroAssembler::Tst(const Register& rn,
                          const Operand& operand) {
   ASSERT(allow_macro_instructions_);
-  And(AppropriateZeroRegFor(rn), rn, operand, SetFlags);
+  Ands(AppropriateZeroRegFor(rn), rn, operand);
 }
 
 
 void MacroAssembler::Bic(const Register& rd,
                          const Register& rn,
-                         const Operand& operand,
-                         FlagsUpdate S) {
+                         const Operand& operand) {
+  ASSERT(allow_macro_instructions_);
+  LogicalMacro(rd, rn, operand, BIC);
+}
+
+
+void MacroAssembler::Bics(const Register& rd,
+                          const Register& rn,
+                          const Operand& operand) {
   ASSERT(allow_macro_instructions_);
-  LogicalMacro(rd, rn, operand, (S == SetFlags) ? BICS : BIC);
+  LogicalMacro(rd, rn, operand, BICS);
 }
 
 
@@ -174,7 +188,9 @@ void MacroAssembler::LogicalMacro(const Register& rd,
 }
 
 
-void MacroAssembler::Mov(const Register& rd, const Operand& operand) {
+void MacroAssembler::Mov(const Register& rd,
+                         const Operand& operand,
+                         DiscardMoveMode discard_mode) {
   ASSERT(allow_macro_instructions_);
   if (operand.IsImmediate()) {
     // Call the macro assembler for generic immediates.
@@ -191,10 +207,16 @@ void MacroAssembler::Mov(const Register& rd, const Operand& operand) {
                     operand.shift_amount());
   } else {
     // Otherwise, emit a register move only if the registers are distinct, or
-    // if they are not X registers. Note that mov(w0, w0) is not a no-op
-    // because it clears the top word of x0.
+    // if they are not X registers.
+    //
+    // Note that mov(w0, w0) is not a no-op because it clears the top word of
+    // x0. A flag is provided (kDiscardForSameWReg) if a move between the same W
+    // registers is not required to clear the top word of the X register. In
+    // this case, the instruction is discarded.
+    //
     // If the sp is an operand, add #0 is emitted, otherwise, orr #0.
-    if (!rd.Is(operand.reg()) || !rd.Is64Bits()) {
+    if (!rd.Is(operand.reg()) || (rd.Is32Bits() &&
+                                  (discard_mode == kDontDiscardForSameWReg))) {
       mov(rd, operand.reg());
     }
   }
@@ -230,19 +252,21 @@ void MacroAssembler::Mov(const Register& rd, uint64_t imm) {
   //
   // Initial values can be generated with:
   //  1. 64-bit move zero (movz).
-  //  2. 32-bit move negative (movn).
-  //  3. 64-bit move negative.
+  //  2. 32-bit move inverted (movn).
+  //  3. 64-bit move inverted.
   //  4. 32-bit orr immediate.
   //  5. 64-bit orr immediate.
-  // Move-keep may then be used to modify each of the 16-bit nybbles.
+  // Move-keep may then be used to modify each of the 16-bit half words.
   //
   // The code below supports all five initial value generators, and
-  // applying move-keep operations to move-zero initial values only.
+  // applying move-keep operations to move-zero and move-inverted initial
+  // values.
 
   unsigned reg_size = rd.size();
   unsigned n, imm_s, imm_r;
   if (IsImmMovz(imm, reg_size) && !rd.IsSP()) {
-    // Immediate can be represented in a move zero instruction.
+    // Immediate can be represented in a move zero instruction. Movz can't
+    // write to the stack pointer.
     movz(rd, imm);
   } else if (IsImmMovn(imm, reg_size) && !rd.IsSP()) {
     // Immediate can be represented in a move negative instruction. Movn can't
@@ -255,20 +279,36 @@ void MacroAssembler::Mov(const Register& rd, uint64_t imm) {
   } else {
     // Generic immediate case. Imm will be represented by
     //   [imm3, imm2, imm1, imm0], where each imm is 16 bits.
-    // A move-zero is generated for the first non-zero immX, and a move-keep
-    // for subsequent non-zero immX.
+    // A move-zero or move-inverted is generated for the first non-zero or
+    // non-0xffff immX, and a move-keep for each subsequent such immX.
+
+    uint64_t ignored_halfword = 0;
+    bool invert_move = false;
+    // If the number of 0xffff halfwords is greater than the number of 0x0000
+    // halfwords, it's more efficient to use move-inverted.
+    if (CountClearHalfWords(~imm, reg_size) >
+        CountClearHalfWords(imm, reg_size)) {
+      ignored_halfword = 0xffffL;
+      invert_move = true;
+    }
 
-    // Use a temporary register when moving to the stack pointer.
+    // Mov instructions can't move values into the stack pointer, so set up a
+    // temporary register, if needed.
     Register temp = rd.IsSP() ? AppropriateTempFor(rd) : rd;
 
+    // Iterate through the halfwords. Use movn/movz for the first non-ignored
+    // halfword, and movk for subsequent halfwords.
     ASSERT((reg_size % 16) == 0);
     bool first_mov_done = false;
     for (unsigned i = 0; i < (temp.size() / 16); i++) {
       uint64_t imm16 = (imm >> (16 * i)) & 0xffffL;
-      if (imm16 != 0) {
+      if (imm16 != ignored_halfword) {
         if (!first_mov_done) {
-          // Move the first non-zero 16-bit chunk into the destination register.
-          movz(temp, imm16, 16 * i);
+          if (invert_move) {
+            movn(temp, (~imm16) & 0xffffL, 16 * i);
+          } else {
+            movz(temp, imm16, 16 * i);
+          }
           first_mov_done = true;
         } else {
           // Construct a wider constant.
@@ -277,34 +317,35 @@ void MacroAssembler::Mov(const Register& rd, uint64_t imm) {
       }
     }
 
+    ASSERT(first_mov_done);
+
+    // Move the temporary if the original destination register was the stack
+    // pointer.
     if (rd.IsSP()) {
       mov(rd, temp);
     }
+  }
+}
 
-    ASSERT(first_mov_done);
+
+unsigned MacroAssembler::CountClearHalfWords(uint64_t imm, unsigned reg_size) {
+  ASSERT((reg_size % 8) == 0);
+  int count = 0;
+  for (unsigned i = 0; i < (reg_size / 16); i++) {
+    if ((imm & 0xffff) == 0) {
+      count++;
+    }
+    imm >>= 16;
   }
+  return count;
 }
 
 
-// The movz instruction can generate immediates containing an arbitrary 16-bit
+// The movz instruction can generate immediates containing an arbitrary 16-bit
 // value, with remaining bits set, eg. 0x00001234, 0x0000123400000000.
 bool MacroAssembler::IsImmMovz(uint64_t imm, unsigned reg_size) {
-  if (reg_size == kXRegSize) {
-    if (((imm & 0xffffffffffff0000UL) == 0UL) ||
-        ((imm & 0xffffffff0000ffffUL) == 0UL) ||
-        ((imm & 0xffff0000ffffffffUL) == 0UL) ||
-        ((imm & 0x0000ffffffffffffUL) == 0UL)) {
-      return true;
-    }
-  } else {
-    ASSERT(reg_size == kWRegSize);
-    imm &= kWRegMask;
-    if (((imm & 0xffff0000) == 0) ||
-        ((imm & 0x0000ffff) == 0)) {
-      return true;
-    }
-  }
-  return false;
+  ASSERT((reg_size == kXRegSize) || (reg_size == kWRegSize));
+  return CountClearHalfWords(imm, reg_size) >= ((reg_size / 16) - 1);
 }
 
 
@@ -320,7 +361,11 @@ void MacroAssembler::Ccmp(const Register& rn,
                           StatusFlags nzcv,
                           Condition cond) {
   ASSERT(allow_macro_instructions_);
-  ConditionalCompareMacro(rn, operand, nzcv, cond, CCMP);
+  if (operand.IsImmediate() && (operand.immediate() < 0)) {
+    ConditionalCompareMacro(rn, -operand.immediate(), nzcv, cond, CCMN);
+  } else {
+    ConditionalCompareMacro(rn, operand, nzcv, cond, CCMP);
+  }
 }
 
 
@@ -329,7 +374,11 @@ void MacroAssembler::Ccmn(const Register& rn,
                           StatusFlags nzcv,
                           Condition cond) {
   ASSERT(allow_macro_instructions_);
-  ConditionalCompareMacro(rn, operand, nzcv, cond, CCMN);
+  if (operand.IsImmediate() && (operand.immediate() < 0)) {
+    ConditionalCompareMacro(rn, -operand.immediate(), nzcv, cond, CCMP);
+  } else {
+    ConditionalCompareMacro(rn, operand, nzcv, cond, CCMN);
+  }
 }
 
 
@@ -347,88 +396,138 @@ void MacroAssembler::ConditionalCompareMacro(const Register& rn,
   } else {
     // The operand isn't directly supported by the instruction: perform the
     // operation on a temporary register.
-    Register temp(NoReg);
-    if (operand.IsImmediate()) {
-      temp = AppropriateTempFor(rn);
-      Mov(temp, operand.immediate());
-    } else if (operand.IsShiftedRegister()) {
-      ASSERT(operand.shift() != ROR);
-      ASSERT(is_uintn(rn.size() == kXRegSize ? kXRegSizeLog2 : kWRegSizeLog2,
-                      operand.shift_amount()));
-      temp = AppropriateTempFor(rn, operand.reg());
-      EmitShift(temp, operand.reg(), operand.shift(), operand.shift_amount());
+    Register temp = AppropriateTempFor(rn);
+    Mov(temp, operand);
+    ConditionalCompare(rn, temp, nzcv, cond, op);
+  }
+}
+
+
+void MacroAssembler::Csel(const Register& rd,
+                          const Register& rn,
+                          const Operand& operand,
+                          Condition cond) {
+  ASSERT(allow_macro_instructions_);
+  ASSERT(!rd.IsZero());
+  ASSERT(!rn.IsZero());
+  ASSERT((cond != al) && (cond != nv));
+  if (operand.IsImmediate()) {
+    // Immediate argument. Handle special cases of 0, 1 and -1 using zero
+    // register.
+    int64_t imm = operand.immediate();
+    Register zr = AppropriateZeroRegFor(rn);
+    if (imm == 0) {
+      csel(rd, rn, zr, cond);
+    } else if (imm == 1) {
+      csinc(rd, rn, zr, cond);
+    } else if (imm == -1) {
+      csinv(rd, rn, zr, cond);
     } else {
-      ASSERT(operand.IsExtendedRegister());
-      ASSERT(operand.reg().size() <= rn.size());
-      // Add/sub extended support a shift <= 4. We want to support exactly the
-      // same modes.
-      ASSERT(operand.shift_amount() <= 4);
-      ASSERT(operand.reg().Is64Bits() ||
-             ((operand.extend() != UXTX) && (operand.extend() != SXTX)));
-      temp = AppropriateTempFor(rn, operand.reg());
-      EmitExtendShift(temp, operand.reg(), operand.extend(),
-                    operand.shift_amount());
+      Register temp = AppropriateTempFor(rn);
+      Mov(temp, operand.immediate());
+      csel(rd, rn, temp, cond);
     }
-    ConditionalCompare(rn, Operand(temp), nzcv, cond, op);
+  } else if (operand.IsShiftedRegister() && (operand.shift_amount() == 0)) {
+    // Unshifted register argument.
+    csel(rd, rn, operand.reg(), cond);
+  } else {
+    // All other arguments.
+    Register temp = AppropriateTempFor(rn);
+    Mov(temp, operand);
+    csel(rd, rn, temp, cond);
   }
 }
 
 
 void MacroAssembler::Add(const Register& rd,
                          const Register& rn,
-                         const Operand& operand,
-                         FlagsUpdate S) {
+                         const Operand& operand) {
   ASSERT(allow_macro_instructions_);
   if (operand.IsImmediate() && (operand.immediate() < 0)) {
-    AddSubMacro(rd, rn, -operand.immediate(), S, SUB);
+    AddSubMacro(rd, rn, -operand.immediate(), LeaveFlags, SUB);
   } else {
-    AddSubMacro(rd, rn, operand, S, ADD);
+    AddSubMacro(rd, rn, operand, LeaveFlags, ADD);
+  }
+}
+
+
+void MacroAssembler::Adds(const Register& rd,
+                          const Register& rn,
+                          const Operand& operand) {
+  ASSERT(allow_macro_instructions_);
+  if (operand.IsImmediate() && (operand.immediate() < 0)) {
+    AddSubMacro(rd, rn, -operand.immediate(), SetFlags, SUB);
+  } else {
+    AddSubMacro(rd, rn, operand, SetFlags, ADD);
   }
 }
 
 
 void MacroAssembler::Sub(const Register& rd,
                          const Register& rn,
-                         const Operand& operand,
-                         FlagsUpdate S) {
+                         const Operand& operand) {
+  ASSERT(allow_macro_instructions_);
+  if (operand.IsImmediate() && (operand.immediate() < 0)) {
+    AddSubMacro(rd, rn, -operand.immediate(), LeaveFlags, ADD);
+  } else {
+    AddSubMacro(rd, rn, operand, LeaveFlags, SUB);
+  }
+}
+
+
+void MacroAssembler::Subs(const Register& rd,
+                          const Register& rn,
+                          const Operand& operand) {
   ASSERT(allow_macro_instructions_);
   if (operand.IsImmediate() && (operand.immediate() < 0)) {
-    AddSubMacro(rd, rn, -operand.immediate(), S, ADD);
+    AddSubMacro(rd, rn, -operand.immediate(), SetFlags, ADD);
   } else {
-    AddSubMacro(rd, rn, operand, S, SUB);
+    AddSubMacro(rd, rn, operand, SetFlags, SUB);
   }
 }
 
 
 void MacroAssembler::Cmn(const Register& rn, const Operand& operand) {
   ASSERT(allow_macro_instructions_);
-  Add(AppropriateZeroRegFor(rn), rn, operand, SetFlags);
+  Adds(AppropriateZeroRegFor(rn), rn, operand);
 }
 
 
 void MacroAssembler::Cmp(const Register& rn, const Operand& operand) {
   ASSERT(allow_macro_instructions_);
-  Sub(AppropriateZeroRegFor(rn), rn, operand, SetFlags);
+  Subs(AppropriateZeroRegFor(rn), rn, operand);
 }
 
 
 void MacroAssembler::Neg(const Register& rd,
-                         const Operand& operand,
-                         FlagsUpdate S) {
+                         const Operand& operand) {
   ASSERT(allow_macro_instructions_);
   if (operand.IsImmediate()) {
     Mov(rd, -operand.immediate());
   } else {
-    Sub(rd, AppropriateZeroRegFor(rd), operand, S);
+    Sub(rd, AppropriateZeroRegFor(rd), operand);
   }
 }
 
 
+void MacroAssembler::Negs(const Register& rd,
+                          const Operand& operand) {
+  ASSERT(allow_macro_instructions_);
+  Subs(rd, AppropriateZeroRegFor(rd), operand);
+}
+
+
 void MacroAssembler::AddSubMacro(const Register& rd,
                                  const Register& rn,
                                  const Operand& operand,
                                  FlagsUpdate S,
                                  AddSubOp op) {
+  if (operand.IsZero() && rd.Is(rn) && rd.Is64Bits() && rn.Is64Bits() &&
+      (S == LeaveFlags)) {
+    // The instruction would be a nop. Avoid generating useless code.
+    return;
+  }
+
   if ((operand.IsImmediate() && !IsImmAddSub(operand.immediate())) ||
       (rn.IsZero() && !operand.IsShiftedRegister())                ||
       (operand.IsShiftedRegister() && (operand.shift() == ROR))) {
@@ -443,28 +542,49 @@ void MacroAssembler::AddSubMacro(const Register& rd,
 
 void MacroAssembler::Adc(const Register& rd,
                          const Register& rn,
-                         const Operand& operand,
-                         FlagsUpdate S) {
+                         const Operand& operand) {
   ASSERT(allow_macro_instructions_);
-  AddSubWithCarryMacro(rd, rn, operand, S, ADC);
+  AddSubWithCarryMacro(rd, rn, operand, LeaveFlags, ADC);
+}
+
+
+void MacroAssembler::Adcs(const Register& rd,
+                          const Register& rn,
+                          const Operand& operand) {
+  ASSERT(allow_macro_instructions_);
+  AddSubWithCarryMacro(rd, rn, operand, SetFlags, ADC);
 }
 
 
 void MacroAssembler::Sbc(const Register& rd,
                          const Register& rn,
-                         const Operand& operand,
-                         FlagsUpdate S) {
+                         const Operand& operand) {
   ASSERT(allow_macro_instructions_);
-  AddSubWithCarryMacro(rd, rn, operand, S, SBC);
+  AddSubWithCarryMacro(rd, rn, operand, LeaveFlags, SBC);
+}
+
+
+void MacroAssembler::Sbcs(const Register& rd,
+                          const Register& rn,
+                          const Operand& operand) {
+  ASSERT(allow_macro_instructions_);
+  AddSubWithCarryMacro(rd, rn, operand, SetFlags, SBC);
 }
 
 
 void MacroAssembler::Ngc(const Register& rd,
-                         const Operand& operand,
-                         FlagsUpdate S) {
+                         const Operand& operand) {
   ASSERT(allow_macro_instructions_);
   Register zr = AppropriateZeroRegFor(rd);
-  Sbc(rd, zr, operand, S);
+  Sbc(rd, zr, operand);
+}
+
+
+void MacroAssembler::Ngcs(const Register& rd,
+                         const Operand& operand) {
+  ASSERT(allow_macro_instructions_);
+  Register zr = AppropriateZeroRegFor(rd);
+  Sbcs(rd, zr, operand);
 }
 
 
@@ -771,8 +891,13 @@ void MacroAssembler::Peek(const Register& dst, const Operand& offset) {
 
 void MacroAssembler::Claim(const Operand& size) {
   ASSERT(allow_macro_instructions_);
+
+  if (size.IsZero()) {
+    return;
+  }
+
   if (size.IsImmediate()) {
-    ASSERT(size.immediate() >= 0);
+    ASSERT(size.immediate() > 0);
     if (sp.Is(StackPointer())) {
       ASSERT((size.immediate() % 16) == 0);
     }
@@ -788,8 +913,13 @@ void MacroAssembler::Claim(const Operand& size) {
 
 void MacroAssembler::Drop(const Operand& size) {
   ASSERT(allow_macro_instructions_);
+
+  if (size.IsZero()) {
+    return;
+  }
+
   if (size.IsImmediate()) {
-    ASSERT(size.immediate() >= 0);
+    ASSERT(size.immediate() > 0);
     if (sp.Is(StackPointer())) {
       ASSERT((size.immediate() % 16) == 0);
     }
diff --git a/src/a64/macro-assembler-a64.h b/src/a64/macro-assembler-a64.h
index f2660637..3c52c995 100644
--- a/src/a64/macro-assembler-a64.h
+++ b/src/a64/macro-assembler-a64.h
@@ -45,6 +45,8 @@
 
 namespace vixl {
 
+enum DiscardMoveMode { kDontDiscardForSameWReg, kDiscardForSameWReg };
+
 class MacroAssembler : public Assembler {
  public:
   MacroAssembler(byte * buffer, unsigned buffer_size)
@@ -57,12 +59,16 @@ class MacroAssembler : public Assembler {
   // Logical macros.
   void And(const Register& rd,
            const Register& rn,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+  void Ands(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
   void Bic(const Register& rd,
            const Register& rn,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+  void Bics(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
   void Orr(const Register& rd,
            const Register& rn,
            const Operand& operand);
@@ -84,17 +90,23 @@ class MacroAssembler : public Assembler {
   // Add and sub macros.
   void Add(const Register& rd,
            const Register& rn,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+  void Adds(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
   void Sub(const Register& rd,
            const Register& rn,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+  void Subs(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
   void Cmn(const Register& rn, const Operand& operand);
   void Cmp(const Register& rn, const Operand& operand);
   void Neg(const Register& rd,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+  void Negs(const Register& rd,
+            const Operand& operand);
+
   void AddSubMacro(const Register& rd,
                    const Register& rn,
                    const Operand& operand,
@@ -104,15 +116,20 @@ class MacroAssembler : public Assembler {
   // Add/sub with carry macros.
   void Adc(const Register& rd,
            const Register& rn,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+  void Adcs(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
   void Sbc(const Register& rd,
            const Register& rn,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+  void Sbcs(const Register& rd,
+            const Register& rn,
+            const Operand& operand);
   void Ngc(const Register& rd,
-           const Operand& operand,
-           FlagsUpdate S = LeaveFlags);
+           const Operand& operand);
+  void Ngcs(const Register& rd,
+            const Operand& operand);
   void AddSubWithCarryMacro(const Register& rd,
                             const Register& rn,
                             const Operand& operand,
@@ -121,15 +138,18 @@ class MacroAssembler : public Assembler {
 
   // Move macros.
   void Mov(const Register& rd, uint64_t imm);
-  void Mov(const Register& rd, const Operand& operand);
+  void Mov(const Register& rd,
+           const Operand& operand,
+           DiscardMoveMode discard_mode = kDontDiscardForSameWReg);
   void Mvn(const Register& rd, uint64_t imm) {
-    Mov(rd, ~imm);
+    Mov(rd, (rd.size() == kXRegSize) ? ~imm : (~imm & kWRegMask));
   };
   void Mvn(const Register& rd, const Operand& operand);
-  bool IsImmMovn(uint64_t imm, unsigned reg_size);
   bool IsImmMovz(uint64_t imm, unsigned reg_size);
+  bool IsImmMovn(uint64_t imm, unsigned reg_size);
+  unsigned CountClearHalfWords(uint64_t imm, unsigned reg_size);
 
-  // Conditional compare macros.
+  // Conditional macros.
   void Ccmp(const Register& rn,
             const Operand& operand,
             StatusFlags nzcv,
@@ -143,6 +163,10 @@ class MacroAssembler : public Assembler {
                                StatusFlags nzcv,
                                Condition cond,
                                ConditionalCompareOp op);
+  void Csel(const Register& rd,
+            const Register& rn,
+            const Operand& operand,
+            Condition cond);
 
   // Load/store macros.
 #define DECLARE_FUNCTION(FN, REGTYPE, REG, OP) \
@@ -384,17 +408,6 @@ class MacroAssembler : public Assembler {
     ASSERT(!rn.IsZero());
     cneg(rd, rn, cond);
   }
-  void Csel(const Register& rd,
-            const Register& rn,
-            const Register& rm,
-            Condition cond) {
-    ASSERT(allow_macro_instructions_);
-    ASSERT(!rd.IsZero());
-    ASSERT(!rn.IsZero());
-    ASSERT(!rm.IsZero());
-    ASSERT((cond != al) && (cond != nv));
-    csel(rd, rn, rm, cond);
-  }
   void Cset(const Register& rd, Condition cond) {
     ASSERT(allow_macro_instructions_);
     ASSERT(!rd.IsZero());
@@ -438,6 +451,14 @@ class MacroAssembler : public Assembler {
     ASSERT((cond != al) && (cond != nv));
     csneg(rd, rn, rm, cond);
   }
+  void Dmb(BarrierDomain domain, BarrierType type) {
+    ASSERT(allow_macro_instructions_);
+    dmb(domain, type);
+  }
+  void Dsb(BarrierDomain domain, BarrierType type) {
+    ASSERT(allow_macro_instructions_);
+    dsb(domain, type);
+  }
   void Extr(const Register& rd,
             const Register& rn,
             const Register& rm,
@@ -490,6 +511,16 @@ class MacroAssembler : public Assembler {
     ASSERT(allow_macro_instructions_);
     fcvt(fd, fn);
   }
+  void Fcvtas(const Register& rd, const FPRegister& fn) {
+    ASSERT(allow_macro_instructions_);
+    ASSERT(!rd.IsZero());
+    fcvtas(rd, fn);
+  }
+  void Fcvtau(const Register& rd, const FPRegister& fn) {
+    ASSERT(allow_macro_instructions_);
+    ASSERT(!rd.IsZero());
+    fcvtau(rd, fn);
+  }
   void Fcvtms(const Register& rd, const FPRegister& fn) {
     ASSERT(allow_macro_instructions_);
     ASSERT(!rd.IsZero());
@@ -528,10 +559,22 @@ class MacroAssembler : public Assembler {
     ASSERT(allow_macro_instructions_);
     fmax(fd, fn, fm);
   }
+  void Fmaxnm(const FPRegister& fd,
+              const FPRegister& fn,
+              const FPRegister& fm) {
+    ASSERT(allow_macro_instructions_);
+    fmaxnm(fd, fn, fm);
+  }
   void Fmin(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm) {
     ASSERT(allow_macro_instructions_);
     fmin(fd, fn, fm);
   }
+  void Fminnm(const FPRegister& fd,
+              const FPRegister& fn,
+              const FPRegister& fm) {
+    ASSERT(allow_macro_instructions_);
+    fminnm(fd, fn, fm);
+  }
   void Fmov(FPRegister fd, FPRegister fn) {
     ASSERT(allow_macro_instructions_);
     // Only emit an instruction if fd and fn are different, and they are both D
@@ -560,6 +603,13 @@ class MacroAssembler : public Assembler {
     ASSERT(allow_macro_instructions_);
     fmul(fd, fn, fm);
   }
+  void Fmadd(const FPRegister& fd,
+             const FPRegister& fn,
+             const FPRegister& fm,
+             const FPRegister& fa) {
+    ASSERT(allow_macro_instructions_);
+    fmadd(fd, fn, fm, fa);
+  }
   void Fmsub(const FPRegister& fd,
              const FPRegister& fn,
              const FPRegister& fm,
@@ -567,10 +617,28 @@ class MacroAssembler : public Assembler {
     ASSERT(allow_macro_instructions_);
     fmsub(fd, fn, fm, fa);
   }
+  void Fnmadd(const FPRegister& fd,
+              const FPRegister& fn,
+              const FPRegister& fm,
+              const FPRegister& fa) {
+    ASSERT(allow_macro_instructions_);
+    fnmadd(fd, fn, fm, fa);
+  }
+  void Fnmsub(const FPRegister& fd,
+              const FPRegister& fn,
+              const FPRegister& fm,
+              const FPRegister& fa) {
+    ASSERT(allow_macro_instructions_);
+    fnmsub(fd, fn, fm, fa);
+  }
   void Fneg(const FPRegister& fd, const FPRegister& fn) {
     ASSERT(allow_macro_instructions_);
     fneg(fd, fn);
   }
+  void Frinta(const FPRegister& fd, const FPRegister& fn) {
+    ASSERT(allow_macro_instructions_);
+    frinta(fd, fn);
+  }
   void Frintn(const FPRegister& fd, const FPRegister& fn) {
     ASSERT(allow_macro_instructions_);
     frintn(fd, fn);
@@ -595,6 +663,10 @@ class MacroAssembler : public Assembler {
     ASSERT(allow_macro_instructions_);
     hlt(code);
   }
+  void Isb() {
+    ASSERT(allow_macro_instructions_);
+    isb();
+  }
   void Ldnp(const CPURegister& rt,
             const CPURegister& rt2,
             const MemOperand& src) {
@@ -668,6 +740,11 @@ class MacroAssembler : public Assembler {
     ASSERT(allow_macro_instructions_);
     mov(rd, rn);
   }
+  void Movk(const Register& rd, uint64_t imm, int shift = -1) {
+    ASSERT(allow_macro_instructions_);
+    ASSERT(!rd.IsZero());
+    movk(rd, imm, shift);
+  }
   void Mrs(const Register& rt, SystemRegister sysreg) {
     ASSERT(allow_macro_instructions_);
     ASSERT(!rt.IsZero());
diff --git a/src/a64/simulator-a64.cc b/src/a64/simulator-a64.cc
index f08e0ed1..211acc59 100644
--- a/src/a64/simulator-a64.cc
+++ b/src/a64/simulator-a64.cc
@@ -57,22 +57,27 @@ SimSystemRegister SimSystemRegister::DefaultValueFor(SystemRegister id) {
 
 
 Simulator::Simulator(Decoder* decoder, FILE* stream) {
-  // Ensure shift operations act as the simulator expects.
+  // Ensure that shift operations act as the simulator expects.
   ASSERT((static_cast<int32_t>(-1) >> 1) == -1);
   ASSERT((static_cast<uint32_t>(-1) >> 1) == 0x7FFFFFFF);
 
-  // Setup the decoder.
+  // Set up the decoder.
   decoder_ = decoder;
   decoder_->AppendVisitor(this);
 
   ResetState();
 
-  // Allocate and setup the simulator stack.
-  stack_ = reinterpret_cast<byte*>(malloc(stack_size_));
+  // Allocate and set up the simulator stack.
+  stack_ = new byte[stack_size_];
   stack_limit_ = stack_ + stack_protection_size_;
-  byte* tos = stack_ + stack_size_ - stack_protection_size_;
-  // The stack pointer must be 16 bytes aligned.
-  set_sp(reinterpret_cast<int64_t>(tos) & ~0xfUL);
+  // Configure the starting stack pointer.
+  //  - Find the top of the stack.
+  uintptr_t tos = reinterpret_cast<uintptr_t>(stack_) + stack_size_;
+  //  - There's a protection region at both ends of the stack.
+  tos -= stack_protection_size_;
+  //  - The stack pointer must be 16-byte aligned.
+  tos &= ~0xfUL;
+  set_sp(tos);
 
   stream_ = stream;
   print_disasm_ = new PrintDisassembler(stream_);
@@ -100,12 +105,12 @@ void Simulator::ResetState() {
     set_dreg_bits(i, 0x7ff000007f800001UL);
   }
   // Returning to address 0 exits the Simulator.
-  set_lr(reinterpret_cast<int64_t>(kEndOfSimAddress));
+  set_lr(kEndOfSimAddress);
 }
 
 
 Simulator::~Simulator() {
-  free(stack_);
+  delete [] stack_;
   // The decoder may outlive the simulator.
   decoder_->RemoveVisitor(print_disasm_);
   delete print_disasm_;
@@ -116,6 +121,7 @@ Simulator::~Simulator() {
 
 
 void Simulator::Run() {
+  pc_modified_ = false;
   while (pc_ != kEndOfSimAddress) {
     ExecuteInstruction();
   }
@@ -123,8 +129,7 @@ void Simulator::Run() {
 
 
 void Simulator::RunFrom(Instruction* first) {
-  pc_ = first;
-  pc_modified_ = false;
+  set_pc(first);
   Run();
 }
 
@@ -394,17 +399,18 @@ void Simulator::PrintRegisters(bool print_all_regs) {
   char const * const clr_reg_value = (coloured_trace_) ? ("\033[1;36m") : ("");
 
   for (unsigned i = 0; i < kNumberOfRegisters; i++) {
-    if (print_all_regs || first_run || (last_regs[i] != registers_[i].x)) {
+    if (print_all_regs || first_run ||
+        (last_regs[i] != xreg(i, Reg31IsStackPointer))) {
       fprintf(stream_,
               "# %s%4s:%s 0x%016" PRIx64 "%s\n",
               clr_reg_name,
               XRegNameForCode(i, Reg31IsStackPointer),
               clr_reg_value,
-              registers_[i].x,
+              xreg(i, Reg31IsStackPointer),
               clr_normal);
     }
     // Cache the new register value so the next run can detect any changes.
-    last_regs[i] = registers_[i].x;
+    last_regs[i] = xreg(i, Reg31IsStackPointer);
   }
   first_run = false;
 }
@@ -424,27 +430,26 @@ void Simulator::PrintFPRegisters(bool print_all_regs) {
   // register in the same column each time (to make it easy to visually scan
   // for changes).
   for (unsigned i = 0; i < kNumberOfFPRegisters; i++) {
-    if (print_all_regs || first_run ||
-        (last_regs[i] != double_to_rawbits(fpregisters_[i].d))) {
+    if (print_all_regs || first_run || (last_regs[i] != dreg_bits(i))) {
       fprintf(stream_,
               "# %s %4s:%s 0x%016" PRIx64 "%s (%s%s:%s %g%s %s:%s %g%s)\n",
               clr_reg_name,
               VRegNameForCode(i),
               clr_reg_value,
-              double_to_rawbits(fpregisters_[i].d),
+              dreg_bits(i),
               clr_normal,
               clr_reg_name,
               DRegNameForCode(i),
               clr_reg_value,
-              fpregisters_[i].d,
+              dreg(i),
               clr_reg_name,
               SRegNameForCode(i),
               clr_reg_value,
-              fpregisters_[i].s,
+              sreg(i),
               clr_normal);
     }
     // Cache the new register value so the next run can detect any changes.
-    last_regs[i] = double_to_rawbits(fpregisters_[i].d);
+    last_regs[i] = dreg_bits(i);
   }
   first_run = false;
 }
@@ -492,7 +497,7 @@ void Simulator::VisitPCRelAddressing(Instruction* instr) {
 void Simulator::VisitUnconditionalBranch(Instruction* instr) {
   switch (instr->Mask(UnconditionalBranchMask)) {
     case BL:
-      set_lr(reinterpret_cast<int64_t>(instr->NextInstruction()));
+      set_lr(instr->NextInstruction());
       // Fall through.
     case B:
       set_pc(instr->ImmPCOffsetTarget());
@@ -515,7 +520,7 @@ void Simulator::VisitUnconditionalBranchToRegister(Instruction* instr) {
 
   switch (instr->Mask(UnconditionalBranchToRegisterMask)) {
     case BLR:
-      set_lr(reinterpret_cast<int64_t>(instr->NextInstruction()));
+      set_lr(instr->NextInstruction());
       // Fall through.
     case BR:
     case RET: set_pc(target); break;
@@ -1112,18 +1117,52 @@ void Simulator::VisitDataProcessing2Source(Instruction* instr) {
   Shift shift_op = NO_SHIFT;
   int64_t result = 0;
   switch (instr->Mask(DataProcessing2SourceMask)) {
-    case SDIV_w: result = wreg(instr->Rn()) / wreg(instr->Rm()); break;
-    case SDIV_x: result = xreg(instr->Rn()) / xreg(instr->Rm()); break;
+    case SDIV_w: {
+      int32_t rn = wreg(instr->Rn());
+      int32_t rm = wreg(instr->Rm());
+      if ((rn == kWMinInt) && (rm == -1)) {
+        result = kWMinInt;
+      } else if (rm == 0) {
+        // Division by zero can be trapped, but not on A-class processors.
+        result = 0;
+      } else {
+        result = rn / rm;
+      }
+      break;
+    }
+    case SDIV_x: {
+      int64_t rn = xreg(instr->Rn());
+      int64_t rm = xreg(instr->Rm());
+      if ((rn == kXMinInt) && (rm == -1)) {
+        result = kXMinInt;
+      } else if (rm == 0) {
+        // Division by zero can be trapped, but not on A-class processors.
+        result = 0;
+      } else {
+        result = rn / rm;
+      }
+      break;
+    }
     case UDIV_w: {
       uint32_t rn = static_cast<uint32_t>(wreg(instr->Rn()));
       uint32_t rm = static_cast<uint32_t>(wreg(instr->Rm()));
-      result = rn / rm;
+      if (rm == 0) {
+        // Division by zero can be trapped, but not on A-class processors.
+        result = 0;
+      } else {
+        result = rn / rm;
+      }
       break;
     }
     case UDIV_x: {
       uint64_t rn = static_cast<uint64_t>(xreg(instr->Rn()));
       uint64_t rm = static_cast<uint64_t>(xreg(instr->Rm()));
-      result = rn / rm;
+      if (rm == 0) {
+        // Division by zero can be trapped, but not on A-class processors.
+        result = 0;
+      } else {
+        result = rn / rm;
+      }
       break;
     }
     case LSLV_w:
@@ -1176,8 +1215,11 @@ void Simulator::VisitDataProcessing3Source(Instruction* instr) {
   unsigned reg_size = instr->SixtyFourBits() ? kXRegSize : kWRegSize;
 
   int64_t result = 0;
-  uint64_t rn;
-  uint64_t rm;
+  // Extract and sign- or zero-extend 32-bit arguments for widening operations.
+  uint64_t rn_u32 = reg<uint32_t>(instr->Rn());
+  uint64_t rm_u32 = reg<uint32_t>(instr->Rm());
+  int64_t rn_s32 = reg<int32_t>(instr->Rn());
+  int64_t rm_s32 = reg<int32_t>(instr->Rm());
   switch (instr->Mask(DataProcessing3SourceMask)) {
     case MADD_w:
     case MADD_x:
@@ -1187,22 +1229,10 @@ void Simulator::VisitDataProcessing3Source(Instruction* instr) {
     case MSUB_x:
       result = xreg(instr->Ra()) - (xreg(instr->Rn()) * xreg(instr->Rm()));
       break;
-    case SMADDL_x:
-      result = xreg(instr->Ra()) + (wreg(instr->Rn()) * wreg(instr->Rm()));
-      break;
-    case SMSUBL_x:
-      result = xreg(instr->Ra()) - (wreg(instr->Rn()) * wreg(instr->Rm()));
-      break;
-    case UMADDL_x:
-      rn = static_cast<uint32_t>(wreg(instr->Rn()));
-      rm = static_cast<uint32_t>(wreg(instr->Rm()));
-      result = xreg(instr->Ra()) + (rn * rm);
-      break;
-    case UMSUBL_x:
-      rn = static_cast<uint32_t>(wreg(instr->Rn()));
-      rm = static_cast<uint32_t>(wreg(instr->Rm()));
-      result = xreg(instr->Ra()) - (rn * rm);
-      break;
+    case SMADDL_x: result = xreg(instr->Ra()) + (rn_s32 * rm_s32); break;
+    case SMSUBL_x: result = xreg(instr->Ra()) - (rn_s32 * rm_s32); break;
+    case UMADDL_x: result = xreg(instr->Ra()) + (rn_u32 * rm_u32); break;
+    case UMSUBL_x: result = xreg(instr->Ra()) - (rn_u32 * rm_u32); break;
     case SMULH_x:
       result = MultiplyHighSigned(xreg(instr->Rn()), xreg(instr->Rm()));
       break;
@@ -1297,6 +1327,14 @@ void Simulator::VisitFPIntegerConvert(Instruction* instr) {
   FPRounding round = RMode();
 
   switch (instr->Mask(FPIntegerConvertMask)) {
+    case FCVTAS_ws: set_wreg(dst, FPToInt32(sreg(src), FPTieAway)); break;
+    case FCVTAS_xs: set_xreg(dst, FPToInt64(sreg(src), FPTieAway)); break;
+    case FCVTAS_wd: set_wreg(dst, FPToInt32(dreg(src), FPTieAway)); break;
+    case FCVTAS_xd: set_xreg(dst, FPToInt64(dreg(src), FPTieAway)); break;
+    case FCVTAU_ws: set_wreg(dst, FPToUInt32(sreg(src), FPTieAway)); break;
+    case FCVTAU_xs: set_xreg(dst, FPToUInt64(sreg(src), FPTieAway)); break;
+    case FCVTAU_wd: set_wreg(dst, FPToUInt32(dreg(src), FPTieAway)); break;
+    case FCVTAU_xd: set_xreg(dst, FPToUInt64(dreg(src), FPTieAway)); break;
     case FCVTMS_ws:
       set_wreg(dst, FPToInt32(sreg(src), FPNegativeInfinity));
       break;
@@ -1494,18 +1532,16 @@ void Simulator::VisitFPConditionalCompare(Instruction* instr) {
 void Simulator::VisitFPConditionalSelect(Instruction* instr) {
   AssertSupportedFPCR();
 
-  unsigned reg_size = instr->FPType() == FP32 ? kSRegSize : kDRegSize;
-
-  double selected_val;
+  Instr selected;
   if (ConditionPassed(static_cast<Condition>(instr->Condition()))) {
-    selected_val = fpreg(reg_size, instr->Rn());
+    selected = instr->Rn();
   } else {
-    selected_val = fpreg(reg_size, instr->Rm());
+    selected = instr->Rm();
   }
 
   switch (instr->Mask(FPConditionalSelectMask)) {
-    case FCSEL_s:
-    case FCSEL_d: set_fpreg(reg_size, instr->Rd(), selected_val); break;
+    case FCSEL_s: set_sreg(instr->Rd(), sreg(selected)); break;
+    case FCSEL_d: set_dreg(instr->Rd(), dreg(selected)); break;
     default: UNIMPLEMENTED();
   }
 }
@@ -1526,6 +1562,8 @@ void Simulator::VisitFPDataProcessing1Source(Instruction* instr) {
     case FNEG_d: set_dreg(fd, -dreg(fn)); break;
     case FSQRT_s: set_sreg(fd, sqrt(sreg(fn))); break;
     case FSQRT_d: set_dreg(fd, sqrt(dreg(fn))); break;
+    case FRINTA_s: set_sreg(fd, FPRoundInt(sreg(fn), FPTieAway)); break;
+    case FRINTA_d: set_dreg(fd, FPRoundInt(dreg(fn), FPTieAway)); break;
     case FRINTN_s: set_sreg(fd, FPRoundInt(sreg(fn), FPTieEven)); break;
     case FRINTN_d: set_dreg(fd, FPRoundInt(dreg(fn), FPTieEven)); break;
     case FRINTZ_s: set_sreg(fd, FPRoundInt(sreg(fn), FPZero)); break;
@@ -1789,6 +1827,14 @@ double Simulator::FPRoundInt(double value, FPRounding round_mode) {
   double int_result = floor(value);
   double error = value - int_result;
   switch (round_mode) {
+    case FPTieAway: {
+      // If the error is greater than 0.5, or is equal to 0.5 and the integer
+      // result is non-negative, round up.
+      if ((error > 0.5) || ((error == 0.5) && (int_result >= 0.0))) {
+        int_result++;
+      }
+      break;
+    }
     case FPTieEven: {
       // If the error is greater than 0.5, or is equal to 0.5 and the integer
       // result is odd, round up.
@@ -1924,6 +1970,10 @@ void Simulator::VisitFPDataProcessing2Source(Instruction* instr) {
     case FMAX_d: set_dreg(fd, FPMax(dreg(fn), dreg(fm))); break;
     case FMIN_s: set_sreg(fd, FPMin(sreg(fn), sreg(fm))); break;
     case FMIN_d: set_dreg(fd, FPMin(dreg(fn), dreg(fm))); break;
+    case FMAXNM_s: set_sreg(fd, FPMaxNM(sreg(fn), sreg(fm))); break;
+    case FMAXNM_d: set_dreg(fd, FPMaxNM(dreg(fn), dreg(fm))); break;
+    case FMINNM_s: set_sreg(fd, FPMinNM(sreg(fn), sreg(fm))); break;
+    case FMINNM_d: set_dreg(fd, FPMinNM(dreg(fn), dreg(fm))); break;
     default: UNIMPLEMENTED();
   }
 }
@@ -1937,25 +1987,34 @@ void Simulator::VisitFPDataProcessing3Source(Instruction* instr) {
   unsigned fm = instr->Rm();
   unsigned fa = instr->Ra();
 
-  // Note: The FMSUB implementation here is not precisely the same as the
-  // instruction definition. In full implementation rounding of results would
-  // occur once at the end, here rounding will occur after the first multiply
-  // and then after the subsequent addition.  A full implementation here would
-  // be possible but would require an effort isn't immediately justified given
-  // the small differences we expect to see in most cases.
-
+  // The C99 (and C++11) fma function performs a fused multiply-accumulate.
   switch (instr->Mask(FPDataProcessing3SourceMask)) {
-    case FMSUB_s: set_sreg(fd, sreg(fa) + (-sreg(fn))*sreg(fm)); break;
-    case FMSUB_d: set_dreg(fd, dreg(fa) + (-dreg(fn))*dreg(fm)); break;
+    // fd = fa +/- (fn * fm)
+    case FMADD_s: set_sreg(fd, fmaf(sreg(fn), sreg(fm), sreg(fa))); break;
+    case FMSUB_s: set_sreg(fd, fmaf(-sreg(fn), sreg(fm), sreg(fa))); break;
+    case FMADD_d: set_dreg(fd, fma(dreg(fn), dreg(fm), dreg(fa))); break;
+    case FMSUB_d: set_dreg(fd, fma(-dreg(fn), dreg(fm), dreg(fa))); break;
+    // Variants of the above where the result is negated.
+    case FNMADD_s: set_sreg(fd, -fmaf(sreg(fn), sreg(fm), sreg(fa))); break;
+    case FNMSUB_s: set_sreg(fd, -fmaf(-sreg(fn), sreg(fm), sreg(fa))); break;
+    case FNMADD_d: set_dreg(fd, -fma(dreg(fn), dreg(fm), dreg(fa))); break;
+    case FNMSUB_d: set_dreg(fd, -fma(-dreg(fn), dreg(fm), dreg(fa))); break;
     default: UNIMPLEMENTED();
   }
 }
 
 
-double Simulator::FPMax(double a, double b) {
-  if (isnan(a)) {
+template <typename T>
+T Simulator::FPMax(T a, T b) {
+  if (IsSignallingNaN(a)) {
+    return a;
+  } else if (IsSignallingNaN(b)) {
+    return b;
+  } else if (isnan(a)) {
+    ASSERT(IsQuietNaN(a));
     return a;
   } else if (isnan(b)) {
+    ASSERT(IsQuietNaN(b));
     return b;
   }
 
@@ -1969,10 +2028,28 @@ double Simulator::FPMax(double a, double b) {
 }
 
 
-double Simulator::FPMin(double a, double b) {
-  if (isnan(a)) {
+// As FPMax, but a lone quiet NaN operand is replaced by -infinity beforehand.
+template <typename T>
+T Simulator::FPMaxNM(T a, T b) {
+  const bool a_is_quiet_nan = IsQuietNaN(a);
+  const bool b_is_quiet_nan = IsQuietNaN(b);
+  if (a_is_quiet_nan && !b_is_quiet_nan) a = kFP64NegativeInfinity;
+  if (b_is_quiet_nan && !a_is_quiet_nan) b = kFP64NegativeInfinity;
+  return FPMax(a, b);
+}
+
+
+template <typename T>
+T Simulator::FPMin(T a, T b) {
+  if (IsSignallingNaN(a)) {
+    return a;
+  } else if (IsSignallingNaN(b)) {
+    return b;
+  } else if (isnan(a)) {
+    ASSERT(IsQuietNaN(a));
     return a;
   } else if (isnan(b)) {
+    ASSERT(IsQuietNaN(b));
     return b;
   }
 
@@ -1985,6 +2062,16 @@ double Simulator::FPMin(double a, double b) {
   }
 }
 
+// As FPMin, but a lone quiet NaN operand is replaced by +infinity beforehand.
+template <typename T>
+T Simulator::FPMinNM(T a, T b) {
+  const bool a_is_quiet_nan = IsQuietNaN(a);
+  const bool b_is_quiet_nan = IsQuietNaN(b);
+  if (a_is_quiet_nan && !b_is_quiet_nan) a = kFP64PositiveInfinity;
+  if (b_is_quiet_nan && !a_is_quiet_nan) b = kFP64PositiveInfinity;
+  return FPMin(a, b);
+}
+
 
 void Simulator::VisitSystem(Instruction* instr) {
   // Some system instructions hijack their Op and Cp fields to represent a
@@ -2015,6 +2102,8 @@ void Simulator::VisitSystem(Instruction* instr) {
       case NOP: break;
       default: UNIMPLEMENTED();
     }
+  } else if (instr->Mask(MemBarrierFMask) == MemBarrierFixed) {
+    __sync_synchronize();
   } else {
     UNIMPLEMENTED();
   }
@@ -2048,21 +2137,23 @@ void Simulator::DoPrintf(Instruction* instr) {
   ASSERT(sizeof(*instr) == 1);
   memcpy(&type, instr + kPrintfTypeOffset, sizeof(type));
 
-  const char * format = reinterpret_cast<const char *>(x0());
+  const char * format = reg<const char *>(0);
   ASSERT(format != NULL);
 
   // Pass all of the relevant PCS registers onto printf. It doesn't matter
   // if we pass too many as the extra ones won't be read.
   int result = 0;
   if (type == CPURegister::kRegister) {
-    result = printf(format, x1(), x2(), x3(), x4(), x5(), x6(), x7());
+    result = printf(format, xreg(1), xreg(2), xreg(3), xreg(4),
+                            xreg(5), xreg(6), xreg(7));
   } else if (type == CPURegister::kFPRegister) {
-    result = printf(format, d0(), d1(), d2(), d3(), d4(), d5(), d6(), d7());
+    result = printf(format, dreg(0), dreg(1), dreg(2), dreg(3),
+                            dreg(4), dreg(5), dreg(6), dreg(7));
   } else {
     ASSERT(type == CPURegister::kNoRegister);
     result = printf("%s", format);
   }
-  set_x0(result);
+  set_xreg(0, result);
 
   // TODO: Clobber all caller-saved registers here, to ensure no assumptions
   // are made about preserved state.
@@ -2071,7 +2162,7 @@ void Simulator::DoPrintf(Instruction* instr) {
   set_pc(instr->InstructionAtOffset(kPrintfLength));
 
   // Set LR as if we'd just called a native printf function.
-  set_lr(reinterpret_cast<uint64_t>(pc()));
+  set_lr(pc());
 }
 
 }  // namespace vixl
diff --git a/src/a64/simulator-a64.h b/src/a64/simulator-a64.h
index 0c22f9e7..efdb9bc2 100644
--- a/src/a64/simulator-a64.h
+++ b/src/a64/simulator-a64.h
@@ -115,6 +115,38 @@ class SimSystemRegister {
 };
 
 
+// Represent a register (r0-r31, v0-v31) as kSizeInBytes of raw storage.
+template<int kSizeInBytes>
+class SimRegisterBase {
+ public:
+  // Store the first 'size' bytes of 'new_value'; the rest of the register is
+  // zeroed, as writing a W register clears the top bits of the X register.
+  template<typename T>
+  void Set(T new_value, unsigned size = sizeof(T)) {
+    ASSERT(size <= kSizeInBytes);
+    ASSERT(size <= sizeof(new_value));
+    memset(value_, 0, kSizeInBytes);
+    memcpy(value_, &new_value, size);
+  }
+
+  // Copy the first 'size' bytes of the register into a zero-filled T.
+  template<typename T>
+  T Get(unsigned size = sizeof(T)) const {
+    ASSERT(size <= kSizeInBytes);
+    ASSERT(size <= sizeof(T));
+    T result;
+    memset(&result, 0, sizeof(result));
+    memcpy(&result, value_, size);
+    return result;
+  }
+
+ protected:
+  uint8_t value_[kSizeInBytes];
+};
+typedef SimRegisterBase<kXRegSizeInBytes> SimRegister;      // r0-r31
+typedef SimRegisterBase<kDRegSizeInBytes> SimFPRegister;    // v0-v31
+
+
 class Simulator : public DecoderVisitor {
  public:
   explicit Simulator(Decoder* decoder, FILE* stream = stdout);
@@ -122,19 +154,6 @@ class Simulator : public DecoderVisitor {
 
   void ResetState();
 
-  // TODO: We assume little endianness, and the way in which the members of this
-  // union overlay. Add tests to ensure this, or fix accessors to no longer
-  // require this assumption.
-  union SimRegister {
-    int64_t x;
-    int32_t w;
-  };
-
-  union SimFPRegister {
-    double d;
-    float s;
-  };
-
   // Run the simulator.
   virtual void Run();
   void RunFrom(Instruction* first);
@@ -167,172 +186,169 @@ class Simulator : public DecoderVisitor {
   #undef DECLARE
 
   // Register accessors.
-  inline int32_t wreg(unsigned code,
-                      Reg31Mode r31mode = Reg31IsZeroRegister) const {
+
+  // Return 'size' bits of the value of an integer register, as the specified
+  // type. The value is zero-extended to fill the result.
+  //
+  // The only supported values of 'size' are kXRegSize and kWRegSize.
+  template<typename T>
+  inline T reg(unsigned size, unsigned code,
+               Reg31Mode r31mode = Reg31IsZeroRegister) const {
+    unsigned size_in_bytes = size / 8;
+    ASSERT(size_in_bytes <= sizeof(T));
+    ASSERT((size == kXRegSize) || (size == kWRegSize));
     ASSERT(code < kNumberOfRegisters);
+
     if ((code == 31) && (r31mode == Reg31IsZeroRegister)) {
-      return 0;
+      T result;
+      memset(&result, 0, sizeof(result));
+      return result;
     }
-    return registers_[code].w;
+    return registers_[code].Get<T>(size_in_bytes);
+  }
+
+  // Like reg(), but infer the access size from the template type.
+  template<typename T>
+  inline T reg(unsigned code, Reg31Mode r31mode = Reg31IsZeroRegister) const {
+    return reg<T>(sizeof(T) * 8, code, r31mode);
+  }
+
+  // Common specialized accessors for the reg() template.
+  inline int32_t wreg(unsigned code,
+                      Reg31Mode r31mode = Reg31IsZeroRegister) const {
+    return reg<int32_t>(code, r31mode);
   }
 
   inline int64_t xreg(unsigned code,
                       Reg31Mode r31mode = Reg31IsZeroRegister) const {
+    return reg<int64_t>(code, r31mode);
+  }
+
+  inline int64_t reg(unsigned size, unsigned code,
+                     Reg31Mode r31mode = Reg31IsZeroRegister) const {
+    return reg<int64_t>(size, code, r31mode);
+  }
+
+  // Write 'size' bits of 'value' into an integer register. The value is
+  // zero-extended. This behaviour matches AArch64 register writes.
+  //
+  // The only supported values of 'size' are kXRegSize and kWRegSize.
+  template<typename T>
+  inline void set_reg(unsigned size, unsigned code, T value,
+                      Reg31Mode r31mode = Reg31IsZeroRegister) {
+    unsigned size_in_bytes = size / 8;
+    ASSERT(size_in_bytes <= sizeof(T));
+    ASSERT((size == kXRegSize) || (size == kWRegSize));
     ASSERT(code < kNumberOfRegisters);
+
     if ((code == 31) && (r31mode == Reg31IsZeroRegister)) {
-      return 0;
+      return;
     }
-    return registers_[code].x;
+    return registers_[code].Set(value, size_in_bytes);
   }
 
-  inline int64_t reg(unsigned size,
-                     unsigned code,
-                     Reg31Mode r31mode = Reg31IsZeroRegister) const {
-    switch (size) {
-      case kWRegSize: return wreg(code, r31mode) & kWRegMask;
-      case kXRegSize: return xreg(code, r31mode);
-      default:
-        UNREACHABLE();
-        return 0;
-    }
+  // Like set_reg(), but infer the access size from the template type.
+  template<typename T>
+  inline void set_reg(unsigned code, T value,
+                      Reg31Mode r31mode = Reg31IsZeroRegister) {
+    set_reg(sizeof(value) * 8, code, value, r31mode);
   }
 
+  // Common specialized accessors for the set_reg() template.
   inline void set_wreg(unsigned code, int32_t value,
                        Reg31Mode r31mode = Reg31IsZeroRegister) {
-    ASSERT(code < kNumberOfRegisters);
-    if ((code == kZeroRegCode) && (r31mode == Reg31IsZeroRegister)) {
-      return;
-    }
-    registers_[code].x = 0;  // First clear the register top bits.
-    registers_[code].w = value;
+    set_reg(kWRegSize, code, value, r31mode);
   }
 
   inline void set_xreg(unsigned code, int64_t value,
                        Reg31Mode r31mode = Reg31IsZeroRegister) {
-    ASSERT(code < kNumberOfRegisters);
-    if ((code == kZeroRegCode) && (r31mode == Reg31IsZeroRegister)) {
-      return;
-    }
-    registers_[code].x = value;
+    set_reg(kXRegSize, code, value, r31mode);
   }
 
-  inline void set_reg(unsigned size, unsigned code, int64_t value,
-                      Reg31Mode r31mode = Reg31IsZeroRegister) {
-    switch (size) {
-      case kWRegSize:
-        return set_wreg(code, static_cast<int32_t>(value & 0xffffffff),
-                        r31mode);
-      case kXRegSize:
-        return set_xreg(code, value, r31mode);
-      default:
-        UNREACHABLE();
-        break;
-    }
+  // Commonly-used special cases.
+  template<typename T>
+  inline void set_lr(T value) {
+    set_reg(kLinkRegCode, value);
   }
 
-  #define REG_ACCESSORS(N)                                 \
-  inline int32_t w##N() { return wreg(N); }                \
-  inline int64_t x##N() { return xreg(N); }                \
-  inline void set_w##N(int32_t val) { set_wreg(N, val); }  \
-  inline void set_x##N(int64_t val) { set_xreg(N, val); }
-  REGISTER_CODE_LIST(REG_ACCESSORS)
-  #undef REG_ACCESSORS
-
-  // Aliases.
-  #define REG_ALIAS_ACCESSORS(N, wname, xname)                \
-  inline int32_t wname() { return wreg(N); }                  \
-  inline int64_t xname() { return xreg(N); }                  \
-  inline void set_##wname(int32_t val) { set_wreg(N, val); }  \
-  inline void set_##xname(int64_t val) { set_xreg(N, val); }
-  REG_ALIAS_ACCESSORS(30, wlr, lr);
-  #undef REG_ALIAS_ACCESSORS
+  template<typename T>
+  inline void set_sp(T value) {
+    set_reg(31, value, Reg31IsStackPointer);
+  }
 
-  // The stack is a special case in aarch64.
-  inline int32_t wsp() { return wreg(31, Reg31IsStackPointer); }
-  inline int64_t sp() { return xreg(31, Reg31IsStackPointer); }
-  inline void set_wsp(int32_t val) {
-    set_wreg(31, val, Reg31IsStackPointer);
+  // Return 'size' bits of the value of a floating-point register, as the
+  // specified type. The value is zero-extended to fill the result.
+  //
+  // The only supported values of 'size' are kDRegSize and kSRegSize.
+  template<typename T>
+  inline T fpreg(unsigned size, unsigned code) const {
+    unsigned size_in_bytes = size / 8;
+    ASSERT(size_in_bytes <= sizeof(T));
+    ASSERT((size == kDRegSize) || (size == kSRegSize));
+    ASSERT(code < kNumberOfFPRegisters);
+    return fpregisters_[code].Get<T>(size_in_bytes);
   }
-  inline void set_sp(int64_t val) {
-    set_xreg(31, val, Reg31IsStackPointer);
+
+  // Like fpreg(), but infer the access size from the template type.
+  template<typename T>
+  inline T fpreg(unsigned code) const {
+    return fpreg<T>(sizeof(T) * 8, code);
   }
 
-  // FPRegister accessors.
+  // Common specialized accessors for the fpreg() template.
   inline float sreg(unsigned code) const {
-    ASSERT(code < kNumberOfFPRegisters);
-    return fpregisters_[code].s;
+    return fpreg<float>(code);
   }
 
   inline uint32_t sreg_bits(unsigned code) const {
-    return float_to_rawbits(sreg(code));
+    return fpreg<uint32_t>(code);
   }
 
   inline double dreg(unsigned code) const {
-    ASSERT(code < kNumberOfFPRegisters);
-    return fpregisters_[code].d;
+    return fpreg<double>(code);
   }
 
   inline uint64_t dreg_bits(unsigned code) const {
-    return double_to_rawbits(dreg(code));
+    return fpreg<uint64_t>(code);
   }
 
   inline double fpreg(unsigned size, unsigned code) const {
     switch (size) {
       case kSRegSize: return sreg(code);
       case kDRegSize: return dreg(code);
-      default: {
+      default:
         UNREACHABLE();
         return 0.0;
-      }
     }
   }
 
-  inline void set_sreg(unsigned code, float val) {
+  // Write 'value' into a floating-point register. The value is zero-extended.
+  // This behaviour matches AArch64 register writes.
+  template<typename T>
+  inline void set_fpreg(unsigned code, T value) {
+    ASSERT((sizeof(value) == kDRegSizeInBytes) ||
+           (sizeof(value) == kSRegSizeInBytes));
     ASSERT(code < kNumberOfFPRegisters);
-    // Ensure that the upper word is set to 0.
-    set_dreg_bits(code, 0);
-
-    fpregisters_[code].s = val;
+    fpregisters_[code].Set(value, sizeof(value));
   }
 
-  inline void set_sreg_bits(unsigned code, uint32_t rawbits) {
-    ASSERT(code < kNumberOfFPRegisters);
-    // Ensure that the upper word is set to 0.
-    set_dreg_bits(code, 0);
-
-    set_sreg(code, rawbits_to_float(rawbits));
+  // Common specialized accessors for the set_fpreg() template.
+  inline void set_sreg(unsigned code, float value) {
+    set_fpreg(code, value);
   }
 
-  inline void set_dreg(unsigned code, double val) {
-    ASSERT(code < kNumberOfFPRegisters);
-    fpregisters_[code].d = val;
+  inline void set_sreg_bits(unsigned code, uint32_t value) {
+    set_fpreg(code, value);
   }
 
-  inline void set_dreg_bits(unsigned code, uint64_t rawbits) {
-    ASSERT(code < kNumberOfFPRegisters);
-    set_dreg(code, rawbits_to_double(rawbits));
+  inline void set_dreg(unsigned code, double value) {
+    set_fpreg(code, value);
   }
 
-  inline void set_fpreg(unsigned size, unsigned code, double value) {
-    switch (size) {
-      case kSRegSize:
-        return set_sreg(code, value);
-      case kDRegSize:
-        return set_dreg(code, value);
-      default:
-        UNREACHABLE();
-        break;
-    }
+  inline void set_dreg_bits(unsigned code, uint64_t value) {
+    set_fpreg(code, value);
   }
 
-  #define FPREG_ACCESSORS(N)                             \
-  inline float s##N() { return sreg(N); }                \
-  inline double d##N() { return dreg(N); }               \
-  inline void set_s##N(float val) { set_sreg(N, val); }  \
-  inline void set_d##N(double val) { set_dreg(N, val); }
-  REGISTER_CODE_LIST(FPREG_ACCESSORS)
-  #undef FPREG_ACCESSORS
-
   bool N() { return nzcv_.N() != 0; }
   bool Z() { return nzcv_.Z() != 0; }
   bool C() { return nzcv_.C() != 0; }
@@ -486,8 +502,18 @@ class Simulator : public DecoderVisitor {
   int64_t FPToInt64(double value, FPRounding rmode);
   uint32_t FPToUInt32(double value, FPRounding rmode);
   uint64_t FPToUInt64(double value, FPRounding rmode);
-  double FPMax(double a, double b);
-  double FPMin(double a, double b);
+
+  template <typename T>
+  T FPMax(T a, T b);
+
+  template <typename T>
+  T FPMin(T a, T b);
+
+  template <typename T>
+  T FPMaxNM(T a, T b);
+
+  template <typename T>
+  T FPMinNM(T a, T b);
 
   // Pseudo Printf instruction
   void DoPrintf(Instruction* instr);
diff --git a/src/utils.h b/src/utils.h
index 4e0b367e..15d144af 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -27,7 +27,7 @@
 #ifndef VIXL_UTILS_H
 #define VIXL_UTILS_H
 
-
+#include <math.h>
 #include <string.h>
 #include "globals.h"
 
@@ -90,12 +90,40 @@ inline int64_t signed_bitextract_64(int msb, int lsb, int64_t x) {
   return (x << (63 - msb)) >> (lsb + 63 - msb);
 }
 
-// floating point representation
+// Floating point representation.
 uint32_t float_to_rawbits(float value);
 uint64_t double_to_rawbits(double value);
 float rawbits_to_float(uint32_t bits);
 double rawbits_to_double(uint64_t bits);
 
+
+// NaN tests.
+inline bool IsSignallingNaN(double num) {
+  // Bit 51 of the raw encoding is the quiet bit; a NaN with it clear is
+  // signalling.
+  const uint64_t kFP64QuietNaNMask = 0x0008000000000000UL;
+  const uint64_t raw_bits = double_to_rawbits(num);
+  const bool quiet_bit_clear = (raw_bits & kFP64QuietNaNMask) == 0;
+  return isnan(num) && quiet_bit_clear;
+}
+
+
+inline bool IsSignallingNaN(float num) {
+  // Bit 22 of the raw encoding is the quiet bit; a NaN with it clear is
+  // signalling. The mask is 32 bits wide to match the raw float bits.
+  const uint32_t kFP32QuietNaNMask = 0x00400000U;
+  const uint32_t raw_bits = float_to_rawbits(num);
+  const bool quiet_bit_clear = (raw_bits & kFP32QuietNaNMask) == 0;
+  return isnan(num) && quiet_bit_clear;
+}
+
+
+template <typename T>
+inline bool IsQuietNaN(T num) {
+  return isnan(num) && !IsSignallingNaN(num);  // NaN, but not signalling.
+}
+
+
 // Bits counting.
 int CountLeadingZeros(uint64_t value, int width);
 int CountLeadingSignBits(int64_t value, int width);
diff --git a/test/test-assembler-a64.cc b/test/test-assembler-a64.cc
index c2a021a9..c135c525 100644
--- a/test/test-assembler-a64.cc
+++ b/test/test-assembler-a64.cc
@@ -25,6 +25,7 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #include <float.h>
@@ -295,6 +296,100 @@ TEST(mvn) {
 }
 
 
+TEST(mov_imm_w) {
+  SETUP();
+
+  START();
+  __ Mov(w0, 0xffffffffL);  // All-ones.
+  __ Mov(w1, 0xffff1234L);  // One 0xffff halfword (high).
+  __ Mov(w2, 0x1234ffffL);  // One 0xffff halfword (low).
+  __ Mov(w3, 0x00000000L);  // All-zeros.
+  __ Mov(w4, 0x00001234L);  // One zero halfword (high).
+  __ Mov(w5, 0x12340000L);  // One zero halfword (low).
+  __ Mov(w6, 0x12345678L);  // No 0x0000 or 0xffff halfwords.
+  END();
+
+  RUN();
+  // Compare via the X registers, so the upper 32 bits must read as zero.
+  ASSERT_EQUAL_64(0xffffffffL, x0);
+  ASSERT_EQUAL_64(0xffff1234L, x1);
+  ASSERT_EQUAL_64(0x1234ffffL, x2);
+  ASSERT_EQUAL_64(0x00000000L, x3);
+  ASSERT_EQUAL_64(0x00001234L, x4);
+  ASSERT_EQUAL_64(0x12340000L, x5);
+  ASSERT_EQUAL_64(0x12345678L, x6);
+
+  TEARDOWN();
+}
+
+
+TEST(mov_imm_x) {
+  SETUP();
+
+  START();
+  __ Mov(x0, 0xffffffffffffffffL);
+  __ Mov(x1, 0xffffffffffff1234L);
+  __ Mov(x2, 0xffffffff12345678L);
+  __ Mov(x3, 0xffff1234ffff5678L);
+  __ Mov(x4, 0x1234ffffffff5678L);
+  __ Mov(x5, 0x1234ffff5678ffffL);
+  __ Mov(x6, 0x12345678ffffffffL);
+  __ Mov(x7, 0x1234ffffffffffffL);
+  __ Mov(x8, 0x123456789abcffffL);
+  __ Mov(x9, 0x12345678ffff9abcL);
+  __ Mov(x10, 0x1234ffff56789abcL);
+  __ Mov(x11, 0xffff123456789abcL);
+  __ Mov(x12, 0x0000000000000000L);
+  __ Mov(x13, 0x0000000000001234L);
+  __ Mov(x14, 0x0000000012345678L);
+  __ Mov(x15, 0x0000123400005678L);
+  __ Mov(x18, 0x1234000000005678L);
+  __ Mov(x19, 0x1234000056780000L);
+  __ Mov(x20, 0x1234567800000000L);
+  __ Mov(x21, 0x1234000000000000L);
+  __ Mov(x22, 0x123456789abc0000L);
+  __ Mov(x23, 0x1234567800009abcL);
+  __ Mov(x24, 0x1234000056789abcL);
+  __ Mov(x25, 0x0000123456789abcL);
+  __ Mov(x26, 0x123456789abcdef0L);
+  __ Mov(x27, 0xffff000000000001L);
+  __ Mov(x28, 0x8000ffff00000000L);
+  END();
+
+  RUN();
+  // Check every register written above (x16 and x17 were not used).
+  ASSERT_EQUAL_64(0xffffffffffffffffL, x0);
+  ASSERT_EQUAL_64(0xffffffffffff1234L, x1);
+  ASSERT_EQUAL_64(0xffffffff12345678L, x2);
+  ASSERT_EQUAL_64(0xffff1234ffff5678L, x3);
+  ASSERT_EQUAL_64(0x1234ffffffff5678L, x4);
+  ASSERT_EQUAL_64(0x1234ffff5678ffffL, x5);
+  ASSERT_EQUAL_64(0x12345678ffffffffL, x6);
+  ASSERT_EQUAL_64(0x1234ffffffffffffL, x7);
+  ASSERT_EQUAL_64(0x123456789abcffffL, x8);
+  ASSERT_EQUAL_64(0x12345678ffff9abcL, x9);
+  ASSERT_EQUAL_64(0x1234ffff56789abcL, x10);
+  ASSERT_EQUAL_64(0xffff123456789abcL, x11);
+  ASSERT_EQUAL_64(0x0000000000000000L, x12);
+  ASSERT_EQUAL_64(0x0000000000001234L, x13);
+  ASSERT_EQUAL_64(0x0000000012345678L, x14);
+  ASSERT_EQUAL_64(0x0000123400005678L, x15);
+  ASSERT_EQUAL_64(0x1234000000005678L, x18);
+  ASSERT_EQUAL_64(0x1234000056780000L, x19);
+  ASSERT_EQUAL_64(0x1234567800000000L, x20);
+  ASSERT_EQUAL_64(0x1234000000000000L, x21);
+  ASSERT_EQUAL_64(0x123456789abc0000L, x22);
+  ASSERT_EQUAL_64(0x1234567800009abcL, x23);
+  ASSERT_EQUAL_64(0x1234000056789abcL, x24);
+  ASSERT_EQUAL_64(0x0000123456789abcL, x25);
+  ASSERT_EQUAL_64(0x123456789abcdef0L, x26);
+  ASSERT_EQUAL_64(0xffff000000000001L, x27);
+  ASSERT_EQUAL_64(0x8000ffff00000000L, x28);
+
+  TEARDOWN();
+}
+
+
 TEST(mov) {
   SETUP();
 
@@ -341,6 +436,9 @@ TEST(mov) {
   __ Mov(w25, Operand(w13, UXTH, 2));
   __ Mov(x26, Operand(x13, SXTH, 3));
   __ Mov(x27, Operand(w13, UXTW, 4));
+
+  __ Mov(x28, 0x0123456789abcdefL);
+  __ Mov(w28, w28, kDiscardForSameWReg);
   END();
 
   RUN();
@@ -370,6 +468,7 @@ TEST(mov) {
   ASSERT_EQUAL_64(0x00007ff8, x25);
   ASSERT_EQUAL_64(0x000000000000fff0UL, x26);
   ASSERT_EQUAL_64(0x000000000001ffe0UL, x27);
+  ASSERT_EQUAL_64(0x0123456789abcdefL, x28);
 
   TEARDOWN();
 }
@@ -603,7 +702,7 @@ TEST(ands) {
 
   START();
   __ Mov(x1, 0xf00000ff);
-  __ And(w0, w1, Operand(w1), SetFlags);
+  __ Ands(w0, w1, Operand(w1));
   END();
 
   RUN();
@@ -614,7 +713,7 @@ TEST(ands) {
   START();
   __ Mov(x0, 0xfff0);
   __ Mov(x1, 0xf00000ff);
-  __ And(w0, w0, Operand(w1, LSR, 4), SetFlags);
+  __ Ands(w0, w0, Operand(w1, LSR, 4));
   END();
 
   RUN();
@@ -625,7 +724,7 @@ TEST(ands) {
   START();
   __ Mov(x0, 0x8000000000000000L);
   __ Mov(x1, 0x00000001);
-  __ And(x0, x0, Operand(x1, ROR, 1), SetFlags);
+  __ Ands(x0, x0, Operand(x1, ROR, 1));
   END();
 
   RUN();
@@ -635,7 +734,7 @@ TEST(ands) {
 
   START();
   __ Mov(x0, 0xfff0);
-  __ And(w0, w0, Operand(0xf), SetFlags);
+  __ Ands(w0, w0, Operand(0xf));
   END();
 
   RUN();
@@ -645,7 +744,7 @@ TEST(ands) {
 
   START();
   __ Mov(x0, 0xff000000);
-  __ And(w0, w0, Operand(0x80000000), SetFlags);
+  __ Ands(w0, w0, Operand(0x80000000));
   END();
 
   RUN();
@@ -741,7 +840,7 @@ TEST(bics) {
 
   START();
   __ Mov(x1, 0xffff);
-  __ Bic(w0, w1, Operand(w1), SetFlags);
+  __ Bics(w0, w1, Operand(w1));
   END();
 
   RUN();
@@ -751,7 +850,7 @@ TEST(bics) {
 
   START();
   __ Mov(x0, 0xffffffff);
-  __ Bic(w0, w0, Operand(w0, LSR, 1), SetFlags);
+  __ Bics(w0, w0, Operand(w0, LSR, 1));
   END();
 
   RUN();
@@ -762,7 +861,7 @@ TEST(bics) {
   START();
   __ Mov(x0, 0x8000000000000000L);
   __ Mov(x1, 0x00000001);
-  __ Bic(x0, x0, Operand(x1, ROR, 1), SetFlags);
+  __ Bics(x0, x0, Operand(x1, ROR, 1));
   END();
 
   RUN();
@@ -772,7 +871,7 @@ TEST(bics) {
 
   START();
   __ Mov(x0, 0xffffffffffffffffL);
-  __ Bic(x0, x0, Operand(0x7fffffffffffffffL), SetFlags);
+  __ Bics(x0, x0, Operand(0x7fffffffffffffffL));
   END();
 
   RUN();
@@ -782,7 +881,7 @@ TEST(bics) {
 
   START();
   __ Mov(w0, 0xffff0000);
-  __ Bic(w0, w0, Operand(0xfffffff0), SetFlags);
+  __ Bics(w0, w0, Operand(0xfffffff0));
   END();
 
   RUN();
@@ -983,6 +1082,29 @@ TEST(mul) {
 }
 
 
+static void SmullHelper(int64_t expected, int64_t a, int64_t b) {
+  SETUP();
+  START();
+  __ Mov(w0, a);  // Both operands are loaded into W registers...
+  __ Mov(w1, b);
+  __ Smull(x2, w0, w1);  // ...and multiplied into an X destination.
+  END();
+  RUN();
+  ASSERT_EQUAL_64(expected, x2);  // The full 64-bit result is compared.
+  TEARDOWN();
+}
+
+
+TEST(smull) {
+  SmullHelper(0, 0, 0);
+  SmullHelper(1, 1, 1);
+  SmullHelper(-1, -1, 1);
+  SmullHelper(1, -1, -1);
+  SmullHelper(0xffffffff80000000, 0x80000000, 1);  // INT32_MIN * 1.
+  SmullHelper(0x0000000080000000, 0x00010000, 0x00008000);  // 2^16 * 2^15.
+}
+
+
 TEST(madd) {
   SETUP();
 
@@ -1268,6 +1390,22 @@ TEST(div) {
   __ Sdiv(x13, x19, x21);
   __ Udiv(x14, x20, x21);
   __ Sdiv(x15, x20, x21);
+
+  __ Udiv(w22, w19, w17);
+  __ Sdiv(w23, w19, w17);
+  __ Udiv(x24, x20, x18);
+  __ Sdiv(x25, x20, x18);
+
+  __ Udiv(x26, x16, x21);
+  __ Sdiv(x27, x16, x21);
+  __ Udiv(x28, x18, x21);
+  __ Sdiv(x29, x18, x21);
+
+  __ Mov(x17, 0);
+  __ Udiv(w18, w16, w17);
+  __ Sdiv(w19, w16, w17);
+  __ Udiv(x20, x16, x17);
+  __ Sdiv(x21, x16, x17);
   END();
 
   RUN();
@@ -1288,6 +1426,18 @@ TEST(div) {
   ASSERT_EQUAL_64(0x40000000, x13);
   ASSERT_EQUAL_64(0x4000000000000000UL, x14);
   ASSERT_EQUAL_64(0xC000000000000000UL, x15);
+  ASSERT_EQUAL_64(0, x22);
+  ASSERT_EQUAL_64(0x80000000, x23);
+  ASSERT_EQUAL_64(0, x24);
+  ASSERT_EQUAL_64(0x8000000000000000UL, x25);
+  ASSERT_EQUAL_64(0, x26);
+  ASSERT_EQUAL_64(0, x27);
+  ASSERT_EQUAL_64(0x7fffffffffffffffUL, x28);
+  ASSERT_EQUAL_64(0, x29);
+  ASSERT_EQUAL_64(0, x18);
+  ASSERT_EQUAL_64(0, x19);
+  ASSERT_EQUAL_64(0, x20);
+  ASSERT_EQUAL_64(0, x21);
 
   TEARDOWN();
 }
@@ -1664,7 +1814,7 @@ TEST(test_branch) {
   __ Mov(x16, 0xaaaaaaaaaaaaaaaaUL);
 
   Label bz, bz_end;
-  __ Tbz(x16, 0, &bz);
+  __ Tbz(w16, 0, &bz);
   __ B(&bz_end);
   __ Bind(&bz);
   __ Mov(x0, 1);
@@ -1685,7 +1835,7 @@ TEST(test_branch) {
   __ Bind(&nbz_end);
 
   Label nbo, nbo_end;
-  __ Tbnz(x16, 2, &nbo);
+  __ Tbnz(w16, 2, &nbo);
   __ B(&nbo_end);
   __ Bind(&nbo);
   __ Mov(x3, 1);
@@ -2868,6 +3018,64 @@ TEST(add_sub_negative) {
 }
 
 
+TEST(add_sub_zero) {
+  SETUP();  // Checks which add/sub-by-zero forms generate code.
+
+  START();
+  __ Mov(x0, 0);
+  __ Mov(x1, 0);
+  __ Mov(x2, 0);
+
+  Label blob1;  // Marks the start of the X-register sequence.
+  __ Bind(&blob1);
+  __ Add(x0, x0, 0);
+  __ Sub(x1, x1, 0);
+  __ Sub(x2, x2, xzr);
+  CHECK(__ SizeOfCodeGeneratedSince(&blob1) == 0);  // Nothing was emitted.
+
+  Label blob2;  // W-register add of immediate zero.
+  __ Bind(&blob2);
+  __ Add(w3, w3, 0);
+  CHECK(__ SizeOfCodeGeneratedSince(&blob2) != 0);  // Code was emitted.
+
+  Label blob3;  // W-register subtract of wzr.
+  __ Bind(&blob3);
+  __ Sub(w3, w3, wzr);
+  CHECK(__ SizeOfCodeGeneratedSince(&blob3) != 0);  // Code was emitted.
+
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_64(0, x0);  // Only x0-x2 are checked after the run.
+  ASSERT_EQUAL_64(0, x1);
+  ASSERT_EQUAL_64(0, x2);
+
+  TEARDOWN();
+}
+
+
+TEST(claim_drop_zero) {
+  SETUP();  // Checks that zero-sized Claim and Drop generate no code.
+
+  START();
+
+  Label start;  // Marks the start of the measured sequence.
+  __ Bind(&start);
+  __ Claim(Operand(0));
+  __ Drop(Operand(0));
+  __ Claim(Operand(xzr));
+  __ Drop(Operand(xzr));
+  CHECK(__ SizeOfCodeGeneratedSince(&start) == 0);  // Nothing was emitted.
+
+  END();
+
+  RUN();  // No register values are asserted after the run.
+
+  TEARDOWN();
+}
+
+
 TEST(neg) {
   SETUP();
 
@@ -2927,7 +3135,7 @@ TEST(adc_sbc_shift) {
   __ Mov(x4, 0xffffffffffffffffL);
 
   // Clear the C flag.
-  __ Add(x0, x0, Operand(0), SetFlags);
+  __ Adds(x0, x0, Operand(0));
 
   __ Adc(x5, x2, Operand(x3));
   __ Adc(x6, x0, Operand(x1, LSL, 60));
@@ -2988,37 +3196,126 @@ TEST(adc_sbc_shift) {
   __ Mov(x0, 1);
   __ Mov(x1, 0xffffffffffffffffL);
   // Clear the C flag.
-  __ Add(x0, x0, Operand(0), SetFlags);
-  __ Adc(x10, x0, Operand(x1), SetFlags);
+  __ Adds(x0, x0, Operand(0));
+  __ Adcs(x10, x0, Operand(x1));
   END();
 
   RUN();
 
   ASSERT_EQUAL_NZCV(ZCFlag);
+  ASSERT_EQUAL_64(0, x10);
 
   START();
   __ Mov(x0, 1);
   __ Mov(x1, 0x8000000000000000L);
   // Clear the C flag.
-  __ Add(x0, x0, Operand(0), SetFlags);
-  __ Adc(x10, x0, Operand(x1, ASR, 63), SetFlags);
+  __ Adds(x0, x0, Operand(0));
+  __ Adcs(x10, x0, Operand(x1, ASR, 63));
   END();
 
   RUN();
 
   ASSERT_EQUAL_NZCV(ZCFlag);
+  ASSERT_EQUAL_64(0, x10);
 
   START();
   __ Mov(x0, 0x10);
   __ Mov(x1, 0x07ffffffffffffffL);
   // Clear the C flag.
-  __ Add(x0, x0, Operand(0), SetFlags);
-  __ Adc(x10, x0, Operand(x1, LSL, 4), SetFlags);
+  __ Adds(x0, x0, Operand(0));
+  __ Adcs(x10, x0, Operand(x1, LSL, 4));
   END();
 
   RUN();
 
   ASSERT_EQUAL_NZCV(NVFlag);
+  ASSERT_EQUAL_64(0x8000000000000000L, x10);
+
+  // Check that sbc correctly sets the condition flags.
+  START();
+  __ Mov(x0, 0);
+  __ Mov(x1, 0xffffffffffffffffL);
+  // Clear the C flag.
+  __ Adds(x0, x0, Operand(0));
+  __ Sbcs(x10, x0, Operand(x1));
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_NZCV(ZFlag);
+  ASSERT_EQUAL_64(0, x10);
+
+  START();
+  __ Mov(x0, 1);
+  __ Mov(x1, 0xffffffffffffffffL);
+  // Clear the C flag.
+  __ Adds(x0, x0, Operand(0));
+  __ Sbcs(x10, x0, Operand(x1, LSR, 1));
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_NZCV(NFlag);
+  ASSERT_EQUAL_64(0x8000000000000001L, x10);
+
+  START();
+  __ Mov(x0, 0);
+  // Clear the C flag.
+  __ Adds(x0, x0, Operand(0));
+  __ Sbcs(x10, x0, Operand(0xffffffffffffffffL));
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_NZCV(ZFlag);
+  ASSERT_EQUAL_64(0, x10);
+
+  START()
+  __ Mov(w0, 0x7fffffff);
+  // Clear the C flag.
+  __ Adds(x0, x0, Operand(0));
+  __ Ngcs(w10, w0);
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_NZCV(NFlag);
+  ASSERT_EQUAL_64(0x80000000, x10);
+
+  START();
+  // Clear the C flag.
+  __ Adds(x0, x0, Operand(0));
+  __ Ngcs(x10, 0x7fffffffffffffffL);
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_NZCV(NFlag);
+  ASSERT_EQUAL_64(0x8000000000000000L, x10);
+
+  START()
+  __ Mov(x0, 0);
+  // Set the C flag.
+  __ Cmp(x0, Operand(x0));
+  __ Sbcs(x10, x0, Operand(1));
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_NZCV(NFlag);
+  ASSERT_EQUAL_64(0xffffffffffffffffL, x10);
+
+  START()
+  __ Mov(x0, 0);
+  // Set the C flag.
+  __ Cmp(x0, Operand(x0));
+  __ Ngcs(x10, 0x7fffffffffffffffL);
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_NZCV(NFlag);
+  ASSERT_EQUAL_64(0x8000000000000001L, x10);
 
   TEARDOWN();
 }
@@ -3029,7 +3326,7 @@ TEST(adc_sbc_extend) {
 
   START();
   // Clear the C flag.
-  __ Add(x0, x0, Operand(0), SetFlags);
+  __ Adds(x0, x0, Operand(0));
 
   __ Mov(x0, 0);
   __ Mov(x1, 1);
@@ -3082,8 +3379,8 @@ TEST(adc_sbc_extend) {
   __ Mov(x0, 0xff);
   __ Mov(x1, 0xffffffffffffffffL);
   // Clear the C flag.
-  __ Add(x0, x0, Operand(0), SetFlags);
-  __ Adc(x10, x0, Operand(x1, SXTX, 1), SetFlags);
+  __ Adds(x0, x0, Operand(0));
+  __ Adcs(x10, x0, Operand(x1, SXTX, 1));
   END();
 
   RUN();
@@ -3094,8 +3391,8 @@ TEST(adc_sbc_extend) {
   __ Mov(x0, 0x7fffffffffffffffL);
   __ Mov(x1, 1);
   // Clear the C flag.
-  __ Add(x0, x0, Operand(0), SetFlags);
-  __ Adc(x10, x0, Operand(x1, UXTB, 2), SetFlags);
+  __ Adds(x0, x0, Operand(0));
+  __ Adcs(x10, x0, Operand(x1, UXTB, 2));
   END();
 
   RUN();
@@ -3105,8 +3402,8 @@ TEST(adc_sbc_extend) {
   START();
   __ Mov(x0, 0x7fffffffffffffffL);
   // Clear the C flag.
-  __ Add(x0, x0, Operand(0), SetFlags);
-  __ Adc(x10, x0, Operand(1), SetFlags);
+  __ Adds(x0, x0, Operand(0));
+  __ Adcs(x10, x0, Operand(1));
   END();
 
   RUN();
@@ -3124,24 +3421,41 @@ TEST(adc_sbc_wide_imm) {
   __ Mov(x0, 0);
 
   // Clear the C flag.
-  __ Add(x0, x0, Operand(0), SetFlags);
+  __ Adds(x0, x0, Operand(0));
 
   __ Adc(x7, x0, Operand(0x1234567890abcdefUL));
   __ Adc(w8, w0, Operand(0xffffffff));
+  __ Sbc(x9, x0, Operand(0x1234567890abcdefUL));
+  __ Sbc(w10, w0, Operand(0xffffffff));
+  __ Ngc(x11, Operand(0xffffffff00000000UL));
+  __ Ngc(w12, Operand(0xffff0000));
 
   // Set the C flag.
   __ Cmp(w0, Operand(w0));
 
-  __ Adc(x27, x0, Operand(0x1234567890abcdefUL));
-  __ Adc(w28, w0, Operand(0xffffffff));
+  __ Adc(x18, x0, Operand(0x1234567890abcdefUL));
+  __ Adc(w19, w0, Operand(0xffffffff));
+  __ Sbc(x20, x0, Operand(0x1234567890abcdefUL));
+  __ Sbc(w21, w0, Operand(0xffffffff));
+  __ Ngc(x22, Operand(0xffffffff00000000UL));
+  __ Ngc(w23, Operand(0xffff0000));
   END();
 
   RUN();
 
   ASSERT_EQUAL_64(0x1234567890abcdefUL, x7);
   ASSERT_EQUAL_64(0xffffffff, x8);
-  ASSERT_EQUAL_64(0x1234567890abcdefUL + 1, x27);
-  ASSERT_EQUAL_64(0, x28);
+  ASSERT_EQUAL_64(0xedcba9876f543210UL, x9);
+  ASSERT_EQUAL_64(0, x10);
+  ASSERT_EQUAL_64(0xffffffff, x11);
+  ASSERT_EQUAL_64(0xffff, x12);
+
+  ASSERT_EQUAL_64(0x1234567890abcdefUL + 1, x18);
+  ASSERT_EQUAL_64(0, x19);
+  ASSERT_EQUAL_64(0xedcba9876f543211UL, x20);
+  ASSERT_EQUAL_64(1, x21);
+  ASSERT_EQUAL_64(0x100000000UL, x22);
+  ASSERT_EQUAL_64(0x10000, x23);
 
   TEARDOWN();
 }
@@ -3156,7 +3470,7 @@ TEST(flags) {
   __ Neg(x11, Operand(x1));
   __ Neg(w12, Operand(w1));
   // Clear the C flag.
-  __ Add(x0, x0, Operand(0), SetFlags);
+  __ Adds(x0, x0, Operand(0));
   __ Ngc(x13, Operand(x0));
   // Set the C flag.
   __ Cmp(x0, Operand(x0));
@@ -3271,8 +3585,8 @@ TEST(flags) {
   __ Mov(w0, 0);
   __ Mov(w1, 1);
   // Clear the C flag.
-  __ Add(w0, w0, Operand(0), SetFlags);
-  __ Ngc(w0, Operand(w1), SetFlags);
+  __ Adds(w0, w0, Operand(0));
+  __ Ngcs(w0, Operand(w1));
   END();
 
   RUN();
@@ -3284,7 +3598,7 @@ TEST(flags) {
   __ Mov(w1, 0);
   // Set the C flag.
   __ Cmp(w0, Operand(w0));
-  __ Ngc(w0, Operand(w1), SetFlags);
+  __ Ngcs(w0, Operand(w1));
   END();
 
   RUN();
@@ -3570,6 +3884,59 @@ TEST(csel) {
 }
 
 
+TEST(csel_imm) {
+  SETUP();  // Tests Csel with immediate and shifted-register operands.
+
+  START();
+  __ Mov(x18, 0);
+  __ Mov(x19, 0x80000000);
+  __ Mov(x20, 0x8000000000000000UL);
+
+  __ Cmp(x18, Operand(0));  // x18 is 0; the results below expect eq.
+  __ Csel(w0, w19, -2, ne);  // 'ne' cases are expected to yield the operand.
+  __ Csel(w1, w19, -1, ne);
+  __ Csel(w2, w19, 0, ne);
+  __ Csel(w3, w19, 1, ne);
+  __ Csel(w4, w19, 2, ne);
+  __ Csel(w5, w19, Operand(w19, ASR, 31), ne);
+  __ Csel(w6, w19, Operand(w19, ROR, 1), ne);
+  __ Csel(w7, w19, 3, eq);  // 'eq' case is expected to yield w19.
+
+  __ Csel(x8, x20, -2, ne);  // Same pattern with X registers.
+  __ Csel(x9, x20, -1, ne);
+  __ Csel(x10, x20, 0, ne);
+  __ Csel(x11, x20, 1, ne);
+  __ Csel(x12, x20, 2, ne);
+  __ Csel(x13, x20, Operand(x20, ASR, 63), ne);
+  __ Csel(x14, x20, Operand(x20, ROR, 1), ne);
+  __ Csel(x15, x20, 3, eq);  // 'eq' case is expected to yield x20.
+
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_32(-2, w0);
+  ASSERT_EQUAL_32(-1, w1);
+  ASSERT_EQUAL_32(0, w2);
+  ASSERT_EQUAL_32(1, w3);
+  ASSERT_EQUAL_32(2, w4);
+  ASSERT_EQUAL_32(-1, w5);  // 0x80000000 ASR 31.
+  ASSERT_EQUAL_32(0x40000000, w6);  // 0x80000000 ROR 1.
+  ASSERT_EQUAL_32(0x80000000, w7);  // Value of w19.
+
+  ASSERT_EQUAL_64(-2, x8);
+  ASSERT_EQUAL_64(-1, x9);
+  ASSERT_EQUAL_64(0, x10);
+  ASSERT_EQUAL_64(1, x11);
+  ASSERT_EQUAL_64(2, x12);
+  ASSERT_EQUAL_64(-1, x13);  // 0x8000000000000000 ASR 63.
+  ASSERT_EQUAL_64(0x4000000000000000UL, x14);  // 0x8000000000000000 ROR 1.
+  ASSERT_EQUAL_64(0x8000000000000000UL, x15);  // Value of x20.
+
+  TEARDOWN();
+}
+
+
 TEST(lslv) {
   SETUP();
 
@@ -3846,11 +4213,11 @@ TEST(sbfm) {
   __ Sbfiz(x21, x2, 8, 16);
   __ Sbfx(x22, x1, 8, 16);
   __ Sbfx(x23, x2, 8, 16);
-  __ Sxtb(x24, x1);
+  __ Sxtb(x24, w1);
   __ Sxtb(x25, x2);
-  __ Sxth(x26, x1);
+  __ Sxth(x26, w1);
   __ Sxth(x27, x2);
-  __ Sxtw(x28, x1);
+  __ Sxtw(x28, w1);
   __ Sxtw(x29, x2);
   END();
 
@@ -4177,85 +4544,215 @@ TEST(fmul) {
 }
 
 
-TEST(fmsub) {
+static void FmaddFmsubDoubleHelper(double n, double m, double a,
+                                   double fmadd, double fmsub) {
   SETUP();
+  START();
+
+  __ Fmov(d0, n);
+  __ Fmov(d1, m);
+  __ Fmov(d2, a);
+  __ Fmadd(d28, d0, d1, d2);
+  __ Fmsub(d29, d0, d1, d2);
+  __ Fnmadd(d30, d0, d1, d2);
+  __ Fnmsub(d31, d0, d1, d2);
+
+  END();
+  RUN();
+
+  ASSERT_EQUAL_FP64(fmadd, d28);
+  ASSERT_EQUAL_FP64(fmsub, d29);
+  ASSERT_EQUAL_FP64(-fmadd, d30);
+  ASSERT_EQUAL_FP64(-fmsub, d31);
+
+  TEARDOWN();
+}
+
 
+TEST(fmadd_fmsub_double) {
+  double inputs[] = {  // Every (n, m, a) triple of these values is tested.
+    // Normal numbers, including -0.0.
+    DBL_MAX, DBL_MIN, 3.25, 2.0, 0.0,
+    -DBL_MAX, -DBL_MIN, -3.25, -2.0, -0.0,
+    // Infinities.
+    kFP64NegativeInfinity, kFP64PositiveInfinity,
+    // Subnormal numbers.
+    rawbits_to_double(0x000fffffffffffff),
+    rawbits_to_double(0x0000000000000001),
+    rawbits_to_double(0x000123456789abcd),
+    -rawbits_to_double(0x000fffffffffffff),
+    -rawbits_to_double(0x0000000000000001),
+    -rawbits_to_double(0x000123456789abcd),
+    // NaN.
+    kFP64QuietNaN,
+    -kFP64QuietNaN,
+  };
+  const int count = sizeof(inputs) / sizeof(inputs[0]);  // Element count.
+
+  for (int in = 0; in < count; in++) {
+    double n = inputs[in];
+    for (int im = 0; im < count; im++) {
+      double m = inputs[im];
+      for (int ia = 0; ia < count; ia++) {
+        double a = inputs[ia];
+        double fmadd = fma(n, m, a);  // Expected values come from host fma().
+        double fmsub = fma(-n, m, a);  // Same, with n negated.
+
+        FmaddFmsubDoubleHelper(n, m, a, fmadd, fmsub);
+      }
+    }
+  }
+}
+
+
+TEST(fmadd_fmsub_double_rounding) {
+  // Make sure we run plenty of tests where an intermediate rounding stage would
+  // produce an incorrect result.
+  const int limit = 1000;  // Minimum number of cases of each kind.
+  int count_fmadd = 0;
+  int count_fmsub = 0;
+
+  uint16_t seed[3] = {42, 43, 44};
+  seed48(seed);  // Fixed seed for the mrand48() calls below.
+
+  while ((count_fmadd < limit) || (count_fmsub < limit)) {
+    double n, m, a;
+    uint32_t r[2];
+    ASSERT(sizeof(r) == sizeof(n));  // r is copied bitwise into a double.
+
+    r[0] = mrand48();
+    r[1] = mrand48();
+    memcpy(&n, r, sizeof(r));  // Random raw bits reinterpreted as a double.
+    r[0] = mrand48();
+    r[1] = mrand48();
+    memcpy(&m, r, sizeof(r));
+    r[0] = mrand48();
+    r[1] = mrand48();
+    memcpy(&a, r, sizeof(r));
+
+    if (!isfinite(a) || !isfinite(n) || !isfinite(m)) {
+      continue;  // Only finite inputs are used.
+    }
+
+    // Calculate the expected results.
+    double fmadd = fma(n, m, a);
+    double fmsub = fma(-n, m, a);
+
+    bool test_fmadd = (fmadd != (a + n * m));  // Differs from mul-then-add?
+    bool test_fmsub = (fmsub != (a - n * m));
+
+    // If rounding would produce a different result, increment the test count.
+    count_fmadd += test_fmadd;  // NOTE(review): assumes a + n * m is not fused.
+    count_fmsub += test_fmsub;
+
+    if (test_fmadd || test_fmsub) {
+      FmaddFmsubDoubleHelper(n, m, a, fmadd, fmsub);
+    }
+  }
+}
+
+
+static void FmaddFmsubFloatHelper(float n, float m, float a,
+                                  float fmadd, float fmsub) {
+  SETUP();
   START();
-  __ Fmov(s16, 3.25);
-  __ Fmov(s17, 2.0);
-  __ Fmov(s18, 0);
-  __ Fmov(s19, -0.5);
-  __ Fmov(s20, kFP32PositiveInfinity);
-  __ Fmov(s21, kFP32NegativeInfinity);
-  __ Fmov(s22, -0);
 
-  __ Fmov(d29, 0);
-  __ Fmov(d30, -2.0);
-  __ Fmov(d31, 2.25);
-  __ Fmov(d28, 4);
-  __ Fmov(d24, kFP64PositiveInfinity);
-  __ Fmov(d25, kFP64NegativeInfinity);
-  __ Fmov(d26, -0);
-
-     // Normal combinations
-  __ Fmsub(s0, s16, s17, s18);
-  __ Fmsub(s1, s17, s18, s16);
-  __ Fmsub(s2, s17, s16, s19);
-     // Pos/Neg Infinity
-  __ Fmsub(s3, s16, s21, s19);
-  __ Fmsub(s4, s17, s16, s20);
-  __ Fmsub(s5, s20, s16, s19);
-  __ Fmsub(s6, s21, s16, s19);
-     // -0
-  __ Fmsub(s7, s22, s16, s19);
-  __ Fmsub(s8, s19, s16, s22);
-
-     // Normal combinations
-  __ Fmsub(d9, d30, d31, d29);
-  __ Fmsub(d10, d29, d31, d30);
-  __ Fmsub(d11, d30, d31, d28);
-     // Pos/Neg Infinity
-  __ Fmsub(d12, d30, d24, d28);
-  __ Fmsub(d13, d24, d31, d25);
-  __ Fmsub(d14, d24, d31, d28);
-  __ Fmsub(d15, d25, d31, d28);
-     // -0
-  __ Fmsub(d16, d26, d31, d28);
-  __ Fmsub(d17, d30, d26, d28);
-  END();
-
-  RUN();
-
-  // Normal combinations
-  ASSERT_EQUAL_FP32(-6.5, s0);
-  ASSERT_EQUAL_FP32(3.25, s1);
-  ASSERT_EQUAL_FP32(-7, s2);
-  // Pos/Neg Infinity
-  ASSERT_EQUAL_FP32(kFP32PositiveInfinity, s3);
-  ASSERT_EQUAL_FP32(kFP32PositiveInfinity, s4);
-  ASSERT_EQUAL_FP32(kFP32NegativeInfinity, s5);
-  ASSERT_EQUAL_FP32(kFP32PositiveInfinity, s6);
-  // -0
-  ASSERT_EQUAL_FP32(-0.5, s7);
-  ASSERT_EQUAL_FP32(1.625, s8);
+  __ Fmov(s0, n);
+  __ Fmov(s1, m);
+  __ Fmov(s2, a);
+  __ Fmadd(s30, s0, s1, s2);
+  __ Fmsub(s31, s0, s1, s2);
+
+  END();
+  RUN();
 
-  // Normal combinations
-  ASSERT_EQUAL_FP64(4.5, d9);
-  ASSERT_EQUAL_FP64(-2.0, d10);
-  ASSERT_EQUAL_FP64(8.5, d11);
-  // Pos/Neg Infinity
-  ASSERT_EQUAL_FP64(kFP64PositiveInfinity, d12);
-  ASSERT_EQUAL_FP64(kFP64NegativeInfinity, d13);
-  ASSERT_EQUAL_FP64(kFP64NegativeInfinity, d14);
-  ASSERT_EQUAL_FP64(kFP64PositiveInfinity, d15);
-  // -0
-  ASSERT_EQUAL_FP64(4.0, d16);
-  ASSERT_EQUAL_FP64(4.0, d17);
+  ASSERT_EQUAL_FP32(fmadd, s30);
+  ASSERT_EQUAL_FP32(fmsub, s31);
 
   TEARDOWN();
 }
 
 
+TEST(fmadd_fmsub_float) {
+  float inputs[] = {  // Every (n, m, a) triple of these values is tested.
+    // Normal numbers, including -0.0f.
+    FLT_MAX, FLT_MIN, 3.25f, 2.0f, 0.0f,
+    -FLT_MAX, -FLT_MIN, -3.25f, -2.0f, -0.0f,
+    // Infinities.
+    kFP32NegativeInfinity, kFP32PositiveInfinity,
+    // Subnormal numbers (exponent field 0, so the raw bits are <= 0x007fffff).
+    rawbits_to_float(0x007fffff),
+    rawbits_to_float(0x00000001),
+    rawbits_to_float(0x00123456),
+    -rawbits_to_float(0x007fffff),
+    -rawbits_to_float(0x00000001),
+    -rawbits_to_float(0x00123456),
+    // NaN.
+    kFP32QuietNaN,
+    -kFP32QuietNaN,
+  };
+  const int count = sizeof(inputs) / sizeof(inputs[0]);  // Element count.
+
+  for (int in = 0; in < count; in++) {
+    float n = inputs[in];
+    for (int im = 0; im < count; im++) {
+      float m = inputs[im];
+      for (int ia = 0; ia < count; ia++) {
+        float a = inputs[ia];
+        float fmadd = fmaf(n, m, a);  // Expected values come from host fmaf().
+        float fmsub = fmaf(-n, m, a);  // Same, with n negated.
+
+        FmaddFmsubFloatHelper(n, m, a, fmadd, fmsub);
+      }
+    }
+  }
+}
+
+
+TEST(fmadd_fmsub_float_rounding) {
+  // Make sure we run plenty of tests where an intermediate rounding stage would
+  // produce an incorrect result.
+  const int limit = 1000;  // Minimum number of cases of each kind.
+  int count_fmadd = 0;
+  int count_fmsub = 0;
+
+  uint16_t seed[3] = {42, 43, 44};
+  seed48(seed);  // Fixed seed for the mrand48() calls below.
+
+  while ((count_fmadd < limit) || (count_fmsub < limit)) {
+    float n, m, a;
+    uint32_t r;
+    ASSERT(sizeof(r) == sizeof(n));  // r is copied bitwise into a float.
+
+    r = mrand48();
+    memcpy(&n, &r, sizeof(r));  // Random raw bits reinterpreted as a float.
+    r = mrand48();
+    memcpy(&m, &r, sizeof(r));
+    r = mrand48();
+    memcpy(&a, &r, sizeof(r));
+
+    if (!isfinite(a) || !isfinite(n) || !isfinite(m)) {
+      continue;  // Only finite inputs are used.
+    }
+
+    // Calculate the expected results.
+    float fmadd = fmaf(n, m, a);
+    float fmsub = fmaf(-n, m, a);
+
+    bool test_fmadd = (fmadd != (a + n * m));  // Differs from mul-then-add?
+    bool test_fmsub = (fmsub != (a - n * m));
+
+    // If rounding would produce a different result, increment the test count.
+    count_fmadd += test_fmadd;  // NOTE(review): assumes a + n * m is not fused.
+    count_fmsub += test_fmsub;
+
+    if (test_fmadd || test_fmsub) {
+      FmaddFmsubFloatHelper(n, m, a, fmadd, fmsub);
+    }
+  }
+}
+
+
 TEST(fdiv) {
   SETUP();
 
@@ -4304,171 +4801,219 @@ TEST(fdiv) {
 }
 
 
-TEST(fmin_s) {
-  SETUP();
+static float MinMaxHelper(float n,
+                          float m,
+                          bool min,
+                          float quiet_nan_substitute = 0.0) {
+  const uint64_t kFP32QuietNaNMask = 0x00400000UL;
+  uint32_t raw_n = float_to_rawbits(n);
+  uint32_t raw_m = float_to_rawbits(m);
+
+  if (isnan(n) && ((raw_n & kFP32QuietNaNMask) == 0)) {
+    // n is signalling NaN.
+    return n;
+  } else if (isnan(m) && ((raw_m & kFP32QuietNaNMask) == 0)) {
+    // m is signalling NaN.
+    return m;
+  } else if (quiet_nan_substitute == 0.0) {
+    if (isnan(n)) {
+      // n is quiet NaN.
+      return n;
+    } else if (isnan(m)) {
+      // m is quiet NaN.
+      return m;
+    }
+  } else {
+    // Substitute n or m if one is quiet, but not both.
+    if (isnan(n) && !isnan(m)) {
+      // n is quiet NaN: replace with substitute.
+      n = quiet_nan_substitute;
+    } else if (!isnan(n) && isnan(m)) {
+      // m is quiet NaN: replace with substitute.
+      m = quiet_nan_substitute;
+    }
+  }
 
-  START();
-  __ Fmov(s25, 0.0);
-  __ Fneg(s26, s25);
-  __ Fmov(s27, kFP32PositiveInfinity);
-  __ Fmov(s28, 1.0);
-  __ Fmin(s0, s25, s26);
-  __ Fmin(s1, s27, s28);
-  __ Fmin(s2, s28, s26);
-  END();
+  if ((n == 0.0) && (m == 0.0) &&
+      (copysign(1.0, n) != copysign(1.0, m))) {
+    return min ? -0.0 : 0.0;
+  }
 
-  RUN();
+  return min ? fminf(n, m) : fmaxf(n, m);
+}
+
+
+static double MinMaxHelper(double n,
+                           double m,
+                           bool min,
+                           double quiet_nan_substitute = 0.0) {
+  const uint64_t kFP64QuietNaNMask = 0x0008000000000000UL;
+  uint64_t raw_n = double_to_rawbits(n);
+  uint64_t raw_m = double_to_rawbits(m);
+
+  if (isnan(n) && ((raw_n & kFP64QuietNaNMask) == 0)) {
+    // n is signalling NaN.
+    return n;
+  } else if (isnan(m) && ((raw_m & kFP64QuietNaNMask) == 0)) {
+    // m is signalling NaN.
+    return m;
+  } else if (quiet_nan_substitute == 0.0) {
+    if (isnan(n)) {
+      // n is quiet NaN.
+      return n;
+    } else if (isnan(m)) {
+      // m is quiet NaN.
+      return m;
+    }
+  } else {
+    // Substitute n or m if one is quiet, but not both.
+    if (isnan(n) && !isnan(m)) {
+      // n is quiet NaN: replace with substitute.
+      n = quiet_nan_substitute;
+    } else if (!isnan(n) && isnan(m)) {
+      // m is quiet NaN: replace with substitute.
+      m = quiet_nan_substitute;
+    }
+  }
 
-  ASSERT_EQUAL_FP32(-0.0, s0);
-  ASSERT_EQUAL_FP32(1.0, s1);
-  ASSERT_EQUAL_FP32(-0.0, s2);
+  if ((n == 0.0) && (m == 0.0) &&
+      (copysign(1.0, n) != copysign(1.0, m))) {
+    return min ? -0.0 : 0.0;
+  }
 
-  TEARDOWN();
+  return min ? fmin(n, m) : fmax(n, m);
 }
 
 
-TEST(fmin_d) {
+static void FminFmaxDoubleHelper(double n, double m, double min, double max,
+                                 double minnm, double maxnm) {
   SETUP();
 
   START();
-  __ Fmov(d25, 0.0);
-  __ Fneg(d26, d25);
-  __ Fmov(d27, kFP32PositiveInfinity);
-  __ Fneg(d28, d27);
-  __ Fmov(d29, 1.0);
-
-  for (unsigned j = 0; j < 5; j++) {
-    for (unsigned i = 0; i < 5; i++) {
-      // Test all combinations, writing results into d0 - d24.
-      __ Fmin(FPRegister::DRegFromCode(i + 5*j),
-              FPRegister::DRegFromCode(i + 25),
-              FPRegister::DRegFromCode(j + 25));
-    }
-  }
+  __ Fmov(d0, n);
+  __ Fmov(d1, m);
+  __ Fmin(d28, d0, d1);
+  __ Fmax(d29, d0, d1);
+  __ Fminnm(d30, d0, d1);
+  __ Fmaxnm(d31, d0, d1);
   END();
 
   RUN();
 
-  // Second register is 0.0.
-  ASSERT_EQUAL_FP64(0.0, d0);
-  ASSERT_EQUAL_FP64(-0.0, d1);
-  ASSERT_EQUAL_FP64(0.0, d2);
-  ASSERT_EQUAL_FP64(kFP32NegativeInfinity, d3);
-  ASSERT_EQUAL_FP64(0.0, d4);
-
-  // Second register is -0.0.
-  ASSERT_EQUAL_FP64(-0.0, d5);
-  ASSERT_EQUAL_FP64(-0.0, d6);
-  ASSERT_EQUAL_FP64(-0.0, d7);
-  ASSERT_EQUAL_FP64(kFP32NegativeInfinity, d8);
-  ASSERT_EQUAL_FP64(-0.0, d9);
+  ASSERT_EQUAL_FP64(min, d28);
+  ASSERT_EQUAL_FP64(max, d29);
+  ASSERT_EQUAL_FP64(minnm, d30);
+  ASSERT_EQUAL_FP64(maxnm, d31);
 
-  // Second register is +Inf.
-  ASSERT_EQUAL_FP64(0.0, d10);
-  ASSERT_EQUAL_FP64(-0.0, d11);
-  ASSERT_EQUAL_FP64(kFP32PositiveInfinity, d12);
-  ASSERT_EQUAL_FP64(kFP32NegativeInfinity, d13);
-  ASSERT_EQUAL_FP64(1.0, d14);
+  TEARDOWN();
+}
 
-  // Second register is -Inf.
-  ASSERT_EQUAL_FP64(kFP32NegativeInfinity, d15);
-  ASSERT_EQUAL_FP64(kFP32NegativeInfinity, d16);
-  ASSERT_EQUAL_FP64(kFP32NegativeInfinity, d17);
-  ASSERT_EQUAL_FP64(kFP32NegativeInfinity, d18);
-  ASSERT_EQUAL_FP64(kFP32NegativeInfinity, d19);
 
-  // Second register is 1.0.
-  ASSERT_EQUAL_FP64(0.0, d20);
-  ASSERT_EQUAL_FP64(-0.0, d21);
-  ASSERT_EQUAL_FP64(1.0, d22);
-  ASSERT_EQUAL_FP64(kFP32NegativeInfinity, d23);
-  ASSERT_EQUAL_FP64(1.0, d24);
-
-  TEARDOWN();
+TEST(fmax_fmin_d) {
+  // Bootstrap tests.
+  FminFmaxDoubleHelper(0, 0, 0, 0, 0, 0);
+  FminFmaxDoubleHelper(0, 1, 0, 1, 0, 1);
+  FminFmaxDoubleHelper(kFP64PositiveInfinity, kFP64NegativeInfinity,
+                       kFP64NegativeInfinity, kFP64PositiveInfinity,
+                       kFP64NegativeInfinity, kFP64PositiveInfinity);
+  FminFmaxDoubleHelper(kFP64SignallingNaN, 0,
+                       kFP64SignallingNaN, kFP64SignallingNaN,
+                       kFP64SignallingNaN, kFP64SignallingNaN);
+  FminFmaxDoubleHelper(kFP64QuietNaN, 0,
+                       kFP64QuietNaN, kFP64QuietNaN,
+                       0, 0);
+  FminFmaxDoubleHelper(kFP64QuietNaN, kFP64SignallingNaN,
+                       kFP64SignallingNaN, kFP64SignallingNaN,
+                       kFP64SignallingNaN, kFP64SignallingNaN);
+
+  // Iterate over all combinations of inputs.
+  double inputs[] = { DBL_MAX, DBL_MIN, 1.0, 0.0,
+                      -DBL_MAX, -DBL_MIN, -1.0, -0.0,
+                      kFP64PositiveInfinity, kFP64NegativeInfinity,
+                      kFP64QuietNaN, kFP64SignallingNaN };
+
+  const int count = sizeof(inputs) / sizeof(inputs[0]);
+
+  for (int in = 0; in < count; in++) {
+    double n = inputs[in];
+    for (int im = 0; im < count; im++) {
+      double m = inputs[im];
+      FminFmaxDoubleHelper(n, m,
+                           MinMaxHelper(n, m, true),
+                           MinMaxHelper(n, m, false),
+                           MinMaxHelper(n, m, true, kFP64PositiveInfinity),
+                           MinMaxHelper(n, m, false, kFP64NegativeInfinity));
+    }
+  }
 }
 
 
-TEST(fmax_s) {
+static void FminFmaxFloatHelper(float n, float m, float min, float max,
+                                float minnm, float maxnm) {
   SETUP();
 
   START();
-  __ Fmov(s25, 0.0);
-  __ Fneg(s26, s25);
-  __ Fmov(s27, kFP32PositiveInfinity);
-  __ Fmov(s28, 1.0);
-  __ Fmax(s0, s25, s26);
-  __ Fmax(s1, s27, s28);
-  __ Fmax(s2, s28, s26);
+  // TODO: Signalling NaNs are sometimes converted by the C compiler to quiet
+  // NaNs on implicit casts from float to double. Here, we move the raw bits
+  // into a W register first, so we get the correct value. Fix Fmov so this
+  // additional step is no longer needed.
+  __ Mov(w0, float_to_rawbits(n));
+  __ Fmov(s0, w0);
+  __ Mov(w0, float_to_rawbits(m));
+  __ Fmov(s1, w0);
+  __ Fmin(s28, s0, s1);
+  __ Fmax(s29, s0, s1);
+  __ Fminnm(s30, s0, s1);
+  __ Fmaxnm(s31, s0, s1);
   END();
 
   RUN();
 
-  ASSERT_EQUAL_FP32(0.0, s0);
-  ASSERT_EQUAL_FP32(kFP32PositiveInfinity, s1);
-  ASSERT_EQUAL_FP32(1.0, s2);
+  ASSERT_EQUAL_FP32(min, s28);
+  ASSERT_EQUAL_FP32(max, s29);
+  ASSERT_EQUAL_FP32(minnm, s30);
+  ASSERT_EQUAL_FP32(maxnm, s31);
 
   TEARDOWN();
 }
 
 
-TEST(fmax_d) {
-  SETUP();
-
-  START();
-  __ Fmov(d25, 0.0);
-  __ Fneg(d26, d25);
-  __ Fmov(d27, kFP32PositiveInfinity);
-  __ Fneg(d28, d27);
-  __ Fmov(d29, 1.0);
-
-  for (unsigned j = 0; j < 5; j++) {
-    for (unsigned i = 0; i < 5; i++) {
-      // Test all combinations, writing results into d0 - d24.
-      __ Fmax(FPRegister::DRegFromCode(i + 5*j),
-              FPRegister::DRegFromCode(i + 25),
-              FPRegister::DRegFromCode(j + 25));
+TEST(fmax_fmin_s) {
+  // Bootstrap tests.
+  FminFmaxFloatHelper(0, 0, 0, 0, 0, 0);
+  FminFmaxFloatHelper(0, 1, 0, 1, 0, 1);
+  FminFmaxFloatHelper(kFP32PositiveInfinity, kFP32NegativeInfinity,
+                      kFP32NegativeInfinity, kFP32PositiveInfinity,
+                      kFP32NegativeInfinity, kFP32PositiveInfinity);
+  FminFmaxFloatHelper(kFP32SignallingNaN, 0,
+                      kFP32SignallingNaN, kFP32SignallingNaN,
+                      kFP32SignallingNaN, kFP32SignallingNaN);
+  FminFmaxFloatHelper(kFP32QuietNaN, 0,
+                      kFP32QuietNaN, kFP32QuietNaN,
+                      0, 0);
+  FminFmaxFloatHelper(kFP32QuietNaN, kFP32SignallingNaN,
+                      kFP32SignallingNaN, kFP32SignallingNaN,
+                      kFP32SignallingNaN, kFP32SignallingNaN);
+
+  // Iterate over all combinations of inputs.
+  float inputs[] = { FLT_MAX, FLT_MIN, 1.0, 0.0,
+                     -FLT_MAX, -FLT_MIN, -1.0, -0.0,
+                     kFP32PositiveInfinity, kFP32NegativeInfinity,
+                     kFP32QuietNaN, kFP32SignallingNaN };
+
+  const int count = sizeof(inputs) / sizeof(inputs[0]);
+
+  for (int in = 0; in < count; in++) {
+    float n = inputs[in];
+    for (int im = 0; im < count; im++) {
+      float m = inputs[im];
+      FminFmaxFloatHelper(n, m,
+                          MinMaxHelper(n, m, true),
+                          MinMaxHelper(n, m, false),
+                          MinMaxHelper(n, m, true, kFP32PositiveInfinity),
+                          MinMaxHelper(n, m, false, kFP32NegativeInfinity));
     }
   }
-  END();
-
-  RUN();
-
-  // Second register is 0.0.
-  ASSERT_EQUAL_FP64(0.0, d0);
-  ASSERT_EQUAL_FP64(0.0, d1);
-  ASSERT_EQUAL_FP64(kFP32PositiveInfinity, d2);
-  ASSERT_EQUAL_FP64(0.0, d3);
-  ASSERT_EQUAL_FP64(1.0, d4);
-
-  // Second register is -0.0.
-  ASSERT_EQUAL_FP64(0.0, d5);
-  ASSERT_EQUAL_FP64(-0.0, d6);
-  ASSERT_EQUAL_FP64(kFP32PositiveInfinity, d7);
-  ASSERT_EQUAL_FP64(-0.0, d8);
-  ASSERT_EQUAL_FP64(1.0, d9);
-
-  // Second register is +Inf.
-  ASSERT_EQUAL_FP64(kFP32PositiveInfinity, d10);
-  ASSERT_EQUAL_FP64(kFP32PositiveInfinity, d11);
-  ASSERT_EQUAL_FP64(kFP32PositiveInfinity, d12);
-  ASSERT_EQUAL_FP64(kFP32PositiveInfinity, d13);
-  ASSERT_EQUAL_FP64(kFP32PositiveInfinity, d14);
-
-  // Second register is -Inf.
-  ASSERT_EQUAL_FP64(0.0, d15);
-  ASSERT_EQUAL_FP64(-0.0, d16);
-  ASSERT_EQUAL_FP64(kFP32PositiveInfinity, d17);
-  ASSERT_EQUAL_FP64(kFP32NegativeInfinity, d18);
-  ASSERT_EQUAL_FP64(1.0, d19);
-
-  // Second register is 1.0.
-  ASSERT_EQUAL_FP64(1.0, d20);
-  ASSERT_EQUAL_FP64(1.0, d21);
-  ASSERT_EQUAL_FP64(kFP32PositiveInfinity, d22);
-  ASSERT_EQUAL_FP64(1.0, d23);
-  ASSERT_EQUAL_FP64(1.0, d24);
-
-  TEARDOWN();
 }
 
 
@@ -4542,6 +5087,11 @@ TEST(fcmp) {
   SETUP();
 
   START();
+
+  // Some of these tests require a floating-point scratch register assigned to
+  // the macro assembler, but most do not.
+  __ SetFPScratchRegister(NoFPReg);
+
   __ Fmov(s8, 0.0);
   __ Fmov(s9, 0.5);
   __ Mov(w18, 0x7f800001);  // Single precision NaN.
@@ -4559,7 +5109,9 @@ TEST(fcmp) {
   __ Mrs(x4, NZCV);
   __ Fcmp(s8, 0.0);
   __ Mrs(x5, NZCV);
+  __ SetFPScratchRegister(d0);
   __ Fcmp(s8, 255.0);
+  __ SetFPScratchRegister(NoFPReg);
   __ Mrs(x6, NZCV);
 
   __ Fmov(d19, 0.0);
@@ -4579,7 +5131,9 @@ TEST(fcmp) {
   __ Mrs(x14, NZCV);
   __ Fcmp(d19, 0.0);
   __ Mrs(x15, NZCV);
+  __ SetFPScratchRegister(d0);
   __ Fcmp(d19, 12.3456);
+  __ SetFPScratchRegister(NoFPReg);
   __ Mrs(x16, NZCV);
   END();
 
@@ -4766,6 +5320,88 @@ TEST(fsqrt) {
 }
 
 
+TEST(frinta) {
+  SETUP();  // Tests Frinta on S and D registers against fixed expectations.
+
+  START();
+  __ Fmov(s16, 1.0);
+  __ Fmov(s17, 1.1);
+  __ Fmov(s18, 1.5);
+  __ Fmov(s19, 1.9);
+  __ Fmov(s20, 2.5);
+  __ Fmov(s21, -1.5);
+  __ Fmov(s22, -2.5);
+  __ Fmov(s23, kFP32PositiveInfinity);
+  __ Fmov(s24, kFP32NegativeInfinity);
+  __ Fmov(s25, 0.0);
+  __ Fmov(s26, -0.0);
+
+  __ Frinta(s0, s16);  // Single-precision results go to s0-s10.
+  __ Frinta(s1, s17);
+  __ Frinta(s2, s18);
+  __ Frinta(s3, s19);
+  __ Frinta(s4, s20);
+  __ Frinta(s5, s21);
+  __ Frinta(s6, s22);
+  __ Frinta(s7, s23);
+  __ Frinta(s8, s24);
+  __ Frinta(s9, s25);
+  __ Frinta(s10, s26);
+
+  __ Fmov(d16, 1.0);  // The same input set, now in double precision.
+  __ Fmov(d17, 1.1);
+  __ Fmov(d18, 1.5);
+  __ Fmov(d19, 1.9);
+  __ Fmov(d20, 2.5);
+  __ Fmov(d21, -1.5);
+  __ Fmov(d22, -2.5);
+  __ Fmov(d23, kFP64PositiveInfinity);  // FP64 constants, as asserted below.
+  __ Fmov(d24, kFP64NegativeInfinity);
+  __ Fmov(d25, 0.0);
+  __ Fmov(d26, -0.0);
+
+  __ Frinta(d11, d16);  // Double-precision results go to d11-d21.
+  __ Frinta(d12, d17);
+  __ Frinta(d13, d18);
+  __ Frinta(d14, d19);
+  __ Frinta(d15, d20);
+  __ Frinta(d16, d21);  // From here on, already-consumed inputs are reused.
+  __ Frinta(d17, d22);
+  __ Frinta(d18, d23);
+  __ Frinta(d19, d24);
+  __ Frinta(d20, d25);
+  __ Frinta(d21, d26);
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_FP32(1.0, s0);
+  ASSERT_EQUAL_FP32(1.0, s1);
+  ASSERT_EQUAL_FP32(2.0, s2);  // 1.5: tie is expected to round away from zero.
+  ASSERT_EQUAL_FP32(2.0, s3);
+  ASSERT_EQUAL_FP32(3.0, s4);  // 2.5: tie is expected to round away from zero.
+  ASSERT_EQUAL_FP32(-2.0, s5);
+  ASSERT_EQUAL_FP32(-3.0, s6);
+  ASSERT_EQUAL_FP32(kFP32PositiveInfinity, s7);
+  ASSERT_EQUAL_FP32(kFP32NegativeInfinity, s8);
+  ASSERT_EQUAL_FP32(0.0, s9);
+  ASSERT_EQUAL_FP32(-0.0, s10);
+  ASSERT_EQUAL_FP64(1.0, d11);
+  ASSERT_EQUAL_FP64(1.0, d12);
+  ASSERT_EQUAL_FP64(2.0, d13);
+  ASSERT_EQUAL_FP64(2.0, d14);
+  ASSERT_EQUAL_FP64(3.0, d15);
+  ASSERT_EQUAL_FP64(-2.0, d16);
+  ASSERT_EQUAL_FP64(-3.0, d17);
+  ASSERT_EQUAL_FP64(kFP64PositiveInfinity, d18);
+  ASSERT_EQUAL_FP64(kFP64NegativeInfinity, d19);
+  ASSERT_EQUAL_FP64(0.0, d20);
+  ASSERT_EQUAL_FP64(-0.0, d21);
+
+  TEARDOWN();
+}
+
+
 TEST(frintn) {
   SETUP();
 
@@ -5011,7 +5647,7 @@ TEST(fcvt_sd) {
     {2.0, 2.0f},
     {FLT_MAX, FLT_MAX},
     //  - The smallest normalized float.
-    {pow(2, -126), pow(2, -126)},
+    {pow(2, -126), powf(2, -126)},
     //  - Normal floats that need (ties-to-even) rounding.
     //    For normalized numbers:
     //         bit 29 (0x0000000020000000) is the lowest-order bit which will
@@ -5109,6 +5745,209 @@ TEST(fcvt_sd) {
 }
 
 
+TEST(fcvtas) {  // Fcvtas: FP to signed integer, ties rounded away from zero.
+  SETUP();
+
+  START();
+  __ Fmov(s0, 1.0);  // s0-s7: float inputs converted into w0-w7.
+  __ Fmov(s1, 1.1);
+  __ Fmov(s2, 2.5);  // Tie: expected result is 3.
+  __ Fmov(s3, -2.5);  // Tie: expected result is -3.
+  __ Fmov(s4, kFP32PositiveInfinity);
+  __ Fmov(s5, kFP32NegativeInfinity);
+  __ Fmov(s6, 0x7fffff80);  // Largest float < INT32_MAX.
+  __ Fneg(s7, s6);          // Smallest float > INT32_MIN.
+  __ Fmov(d8, 1.0);  // d8-d15: double inputs converted into w8-w15.
+  __ Fmov(d9, 1.1);
+  __ Fmov(d10, 2.5);
+  __ Fmov(d11, -2.5);
+  __ Fmov(d12, kFP64PositiveInfinity);
+  __ Fmov(d13, kFP64NegativeInfinity);
+  __ Fmov(d14, kWMaxInt - 1);
+  __ Fmov(d15, kWMinInt + 1);
+  __ Fmov(s17, 1.1);  // s17-s23: float inputs converted into x17-x23.
+  __ Fmov(s18, 2.5);
+  __ Fmov(s19, -2.5);
+  __ Fmov(s20, kFP32PositiveInfinity);
+  __ Fmov(s21, kFP32NegativeInfinity);
+  __ Fmov(s22, 0x7fffff8000000000UL);   // Largest float < INT64_MAX.
+  __ Fneg(s23, s22);                    // Smallest float > INT64_MIN.
+  __ Fmov(d24, 1.1);  // d24-d30: double inputs converted into x24-x30.
+  __ Fmov(d25, 2.5);
+  __ Fmov(d26, -2.5);
+  __ Fmov(d27, kFP64PositiveInfinity);
+  __ Fmov(d28, kFP64NegativeInfinity);
+  __ Fmov(d29, 0x7ffffffffffffc00UL);   // Largest double < INT64_MAX.
+  __ Fneg(d30, d29);                    // Smallest double > INT64_MIN.
+
+  __ Fcvtas(w0, s0);  // Float source, 32-bit destination.
+  __ Fcvtas(w1, s1);
+  __ Fcvtas(w2, s2);
+  __ Fcvtas(w3, s3);
+  __ Fcvtas(w4, s4);
+  __ Fcvtas(w5, s5);
+  __ Fcvtas(w6, s6);
+  __ Fcvtas(w7, s7);
+  __ Fcvtas(w8, d8);  // Double source, 32-bit destination.
+  __ Fcvtas(w9, d9);
+  __ Fcvtas(w10, d10);
+  __ Fcvtas(w11, d11);
+  __ Fcvtas(w12, d12);
+  __ Fcvtas(w13, d13);
+  __ Fcvtas(w14, d14);
+  __ Fcvtas(w15, d15);
+  __ Fcvtas(x17, s17);  // Float source, 64-bit destination.
+  __ Fcvtas(x18, s18);
+  __ Fcvtas(x19, s19);
+  __ Fcvtas(x20, s20);
+  __ Fcvtas(x21, s21);
+  __ Fcvtas(x22, s22);
+  __ Fcvtas(x23, s23);
+  __ Fcvtas(x24, d24);  // Double source, 64-bit destination.
+  __ Fcvtas(x25, d25);
+  __ Fcvtas(x26, d26);
+  __ Fcvtas(x27, d27);
+  __ Fcvtas(x28, d28);
+  __ Fcvtas(x29, d29);
+  __ Fcvtas(x30, d30);
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_64(1, x0);
+  ASSERT_EQUAL_64(1, x1);
+  ASSERT_EQUAL_64(3, x2);
+  ASSERT_EQUAL_64(0xfffffffd, x3);  // -3 as a 32-bit two's-complement value.
+  ASSERT_EQUAL_64(0x7fffffff, x4);  // +infinity is expected to saturate.
+  ASSERT_EQUAL_64(0x80000000, x5);  // -infinity is expected to saturate.
+  ASSERT_EQUAL_64(0x7fffff80, x6);
+  ASSERT_EQUAL_64(0x80000080, x7);
+  ASSERT_EQUAL_64(1, x8);
+  ASSERT_EQUAL_64(1, x9);
+  ASSERT_EQUAL_64(3, x10);
+  ASSERT_EQUAL_64(0xfffffffd, x11);
+  ASSERT_EQUAL_64(0x7fffffff, x12);
+  ASSERT_EQUAL_64(0x80000000, x13);
+  ASSERT_EQUAL_64(0x7ffffffe, x14);
+  ASSERT_EQUAL_64(0x80000001, x15);
+  ASSERT_EQUAL_64(1, x17);
+  ASSERT_EQUAL_64(3, x18);
+  ASSERT_EQUAL_64(0xfffffffffffffffdUL, x19);  // -3 as a 64-bit value.
+  ASSERT_EQUAL_64(0x7fffffffffffffffUL, x20);  // +infinity saturates.
+  ASSERT_EQUAL_64(0x8000000000000000UL, x21);  // -infinity saturates.
+  ASSERT_EQUAL_64(0x7fffff8000000000UL, x22);
+  ASSERT_EQUAL_64(0x8000008000000000UL, x23);
+  ASSERT_EQUAL_64(1, x24);
+  ASSERT_EQUAL_64(3, x25);
+  ASSERT_EQUAL_64(0xfffffffffffffffdUL, x26);
+  ASSERT_EQUAL_64(0x7fffffffffffffffUL, x27);
+  ASSERT_EQUAL_64(0x8000000000000000UL, x28);
+  ASSERT_EQUAL_64(0x7ffffffffffffc00UL, x29);
+  ASSERT_EQUAL_64(0x8000000000000400UL, x30);
+
+  TEARDOWN();
+}
+
+
+TEST(fcvtau) {  // Fcvtau: FP to unsigned integer, ties rounded away from zero.
+  SETUP();
+
+  START();
+  __ Fmov(s0, 1.0);  // s0-s6: float inputs converted into w0-w6.
+  __ Fmov(s1, 1.1);
+  __ Fmov(s2, 2.5);  // Tie: expected result is 3.
+  __ Fmov(s3, -2.5);  // Negative input: expected result is 0.
+  __ Fmov(s4, kFP32PositiveInfinity);
+  __ Fmov(s5, kFP32NegativeInfinity);
+  __ Fmov(s6, 0xffffff00);  // Largest float < UINT32_MAX.
+  __ Fmov(d8, 1.0);  // d8-d14: double inputs converted into w8-w14.
+  __ Fmov(d9, 1.1);
+  __ Fmov(d10, 2.5);
+  __ Fmov(d11, -2.5);
+  __ Fmov(d12, kFP64PositiveInfinity);
+  __ Fmov(d13, kFP64NegativeInfinity);
+  __ Fmov(d14, 0xfffffffe);
+  __ Fmov(s16, 1.0);  // s16-s22: float inputs converted into x16-x22.
+  __ Fmov(s17, 1.1);
+  __ Fmov(s18, 2.5);
+  __ Fmov(s19, -2.5);
+  __ Fmov(s20, kFP32PositiveInfinity);
+  __ Fmov(s21, kFP32NegativeInfinity);
+  __ Fmov(s22, 0xffffff0000000000UL);  // Largest float < UINT64_MAX.
+  __ Fmov(d24, 1.1);  // d24-d29: double inputs converted into x24-x29.
+  __ Fmov(d25, 2.5);
+  __ Fmov(d26, -2.5);
+  __ Fmov(d27, kFP64PositiveInfinity);
+  __ Fmov(d28, kFP64NegativeInfinity);
+  __ Fmov(d29, 0xfffffffffffff800UL);  // Largest double < UINT64_MAX.
+  __ Fmov(s30, 0x100000000UL);  // 2^32: too large for a 32-bit result.
+
+  __ Fcvtau(w0, s0);  // Float source, 32-bit destination.
+  __ Fcvtau(w1, s1);
+  __ Fcvtau(w2, s2);
+  __ Fcvtau(w3, s3);
+  __ Fcvtau(w4, s4);
+  __ Fcvtau(w5, s5);
+  __ Fcvtau(w6, s6);
+  __ Fcvtau(w8, d8);  // Double source, 32-bit destination.
+  __ Fcvtau(w9, d9);
+  __ Fcvtau(w10, d10);
+  __ Fcvtau(w11, d11);
+  __ Fcvtau(w12, d12);
+  __ Fcvtau(w13, d13);
+  __ Fcvtau(w14, d14);
+  __ Fcvtau(w15, d15);  // NOTE(review): d15 is not set here; x15 unchecked.
+  __ Fcvtau(x16, s16);  // Float source, 64-bit destination.
+  __ Fcvtau(x17, s17);
+  __ Fcvtau(x18, s18);
+  __ Fcvtau(x19, s19);
+  __ Fcvtau(x20, s20);
+  __ Fcvtau(x21, s21);
+  __ Fcvtau(x22, s22);
+  __ Fcvtau(x24, d24);  // Double source, 64-bit destination.
+  __ Fcvtau(x25, d25);
+  __ Fcvtau(x26, d26);
+  __ Fcvtau(x27, d27);
+  __ Fcvtau(x28, d28);
+  __ Fcvtau(x29, d29);
+  __ Fcvtau(w30, s30);  // Out-of-range float into a 32-bit destination.
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_64(1, x0);
+  ASSERT_EQUAL_64(1, x1);
+  ASSERT_EQUAL_64(3, x2);
+  ASSERT_EQUAL_64(0, x3);
+  ASSERT_EQUAL_64(0xffffffff, x4);  // +infinity is expected to saturate.
+  ASSERT_EQUAL_64(0, x5);  // -infinity is expected to give 0.
+  ASSERT_EQUAL_64(0xffffff00, x6);
+  ASSERT_EQUAL_64(1, x8);
+  ASSERT_EQUAL_64(1, x9);
+  ASSERT_EQUAL_64(3, x10);
+  ASSERT_EQUAL_64(0, x11);
+  ASSERT_EQUAL_64(0xffffffff, x12);
+  ASSERT_EQUAL_64(0, x13);
+  ASSERT_EQUAL_64(0xfffffffe, x14);
+  ASSERT_EQUAL_64(1, x16);
+  ASSERT_EQUAL_64(1, x17);
+  ASSERT_EQUAL_64(3, x18);
+  ASSERT_EQUAL_64(0, x19);
+  ASSERT_EQUAL_64(0xffffffffffffffffUL, x20);  // +infinity saturates.
+  ASSERT_EQUAL_64(0, x21);
+  ASSERT_EQUAL_64(0xffffff0000000000UL, x22);
+  ASSERT_EQUAL_64(1, x24);
+  ASSERT_EQUAL_64(3, x25);
+  ASSERT_EQUAL_64(0, x26);
+  ASSERT_EQUAL_64(0xffffffffffffffffUL, x27);
+  ASSERT_EQUAL_64(0, x28);
+  ASSERT_EQUAL_64(0xfffffffffffff800UL, x29);
+  ASSERT_EQUAL_64(0xffffffff, x30);  // 2^32 is expected to saturate.
+
+  TEARDOWN();
+}
+
+
 TEST(fcvtms) {
   SETUP();
 
@@ -5957,8 +6796,8 @@ static void TestUScvtf32Helper(uint64_t in,
   float expected_ucvtf_base = rawbits_to_float(expected_ucvtf_bits);
 
   for (int fbits = 0; fbits <= 32; fbits++) {
-    float expected_scvtf = expected_scvtf_base / pow(2, fbits);
-    float expected_ucvtf = expected_ucvtf_base / pow(2, fbits);
+    float expected_scvtf = expected_scvtf_base / powf(2, fbits);
+    float expected_ucvtf = expected_ucvtf_base / powf(2, fbits);
     ASSERT_EQUAL_FP32(expected_scvtf, results_scvtf_x[fbits]);
     ASSERT_EQUAL_FP32(expected_ucvtf, results_ucvtf_x[fbits]);
     if (cvtf_s32) ASSERT_EQUAL_FP32(expected_scvtf, results_scvtf_w[fbits]);
@@ -5967,8 +6806,8 @@ static void TestUScvtf32Helper(uint64_t in,
   }
   for (int fbits = 33; fbits <= 64; fbits++) {
     break;
-    float expected_scvtf = expected_scvtf_base / pow(2, fbits);
-    float expected_ucvtf = expected_ucvtf_base / pow(2, fbits);
+    float expected_scvtf = expected_scvtf_base / powf(2, fbits);
+    float expected_ucvtf = expected_ucvtf_base / powf(2, fbits);
     ASSERT_EQUAL_FP32(expected_scvtf, results_scvtf_x[fbits]);
     ASSERT_EQUAL_FP32(expected_ucvtf, results_ucvtf_x[fbits]);
   }
@@ -6062,7 +6901,7 @@ TEST(system_mrs) {
   __ Mrs(x4, NZCV);
 
   // Set the Z, C and V flags.
-  __ Add(w0, w2, w2, SetFlags);
+  __ Adds(w0, w2, w2);
   __ Mrs(x5, NZCV);
 
   // Read the default FPCR.
@@ -6257,31 +7096,31 @@ TEST(zero_dest_setflags) {
 
   // All of these instructions should only write to the flags in these forms,
   // but have alternate forms which can write into the stack pointer.
-  __ add(xzr, x0, Operand(x1, UXTX), SetFlags);
-  __ add(xzr, x1, Operand(xzr, UXTX), SetFlags);
-  __ add(xzr, x1, 1234, SetFlags);
-  __ add(xzr, x0, x1, SetFlags);
-  __ add(xzr, x1, xzr, SetFlags);
-  __ add(xzr, xzr, x1, SetFlags);
-
-  __ and_(xzr, x2, ~0xf, SetFlags);
-  __ and_(xzr, xzr, ~0xf, SetFlags);
-  __ and_(xzr, x0, x2, SetFlags);
-  __ and_(xzr, x2, xzr, SetFlags);
-  __ and_(xzr, xzr, x2, SetFlags);
-
-  __ bic(xzr, x3, ~0xf, SetFlags);
-  __ bic(xzr, xzr, ~0xf, SetFlags);
-  __ bic(xzr, x0, x3, SetFlags);
-  __ bic(xzr, x3, xzr, SetFlags);
-  __ bic(xzr, xzr, x3, SetFlags);
-
-  __ sub(xzr, x0, Operand(x3, UXTX), SetFlags);
-  __ sub(xzr, x3, Operand(xzr, UXTX), SetFlags);
-  __ sub(xzr, x3, 1234, SetFlags);
-  __ sub(xzr, x0, x3, SetFlags);
-  __ sub(xzr, x3, xzr, SetFlags);
-  __ sub(xzr, xzr, x3, SetFlags);
+  __ adds(xzr, x0, Operand(x1, UXTX));
+  __ adds(xzr, x1, Operand(xzr, UXTX));
+  __ adds(xzr, x1, 1234);
+  __ adds(xzr, x0, x1);
+  __ adds(xzr, x1, xzr);
+  __ adds(xzr, xzr, x1);
+
+  __ ands(xzr, x2, ~0xf);
+  __ ands(xzr, xzr, ~0xf);
+  __ ands(xzr, x0, x2);
+  __ ands(xzr, x2, xzr);
+  __ ands(xzr, xzr, x2);
+
+  __ bics(xzr, x3, ~0xf);
+  __ bics(xzr, xzr, ~0xf);
+  __ bics(xzr, x0, x3);
+  __ bics(xzr, x3, xzr);
+  __ bics(xzr, xzr, x3);
+
+  __ subs(xzr, x0, Operand(x3, UXTX));
+  __ subs(xzr, x3, Operand(xzr, UXTX));
+  __ subs(xzr, x3, 1234);
+  __ subs(xzr, x0, x3);
+  __ subs(xzr, x3, xzr);
+  __ subs(xzr, xzr, x3);
 
   // Swap the saved stack pointer with the real one. If sp was written
   // during the test, it will show up in x30. This is done because the test
@@ -7763,4 +8602,63 @@ TEST(blr_lr) {
   TEARDOWN();
 }
 
+
+TEST(barriers) {
+  // Generate all supported barriers; this is just a smoke test.
+  SETUP();
+
+  START();
+
+  // DMB: one instruction per (domain, type) pair.
+  __ Dmb(FullSystem, BarrierAll);
+  __ Dmb(FullSystem, BarrierReads);
+  __ Dmb(FullSystem, BarrierWrites);
+  __ Dmb(FullSystem, BarrierOther);
+
+  __ Dmb(InnerShareable, BarrierAll);
+  __ Dmb(InnerShareable, BarrierReads);
+  __ Dmb(InnerShareable, BarrierWrites);
+  __ Dmb(InnerShareable, BarrierOther);
+
+  __ Dmb(NonShareable, BarrierAll);
+  __ Dmb(NonShareable, BarrierReads);
+  __ Dmb(NonShareable, BarrierWrites);
+  __ Dmb(NonShareable, BarrierOther);
+
+  __ Dmb(OuterShareable, BarrierAll);
+  __ Dmb(OuterShareable, BarrierReads);
+  __ Dmb(OuterShareable, BarrierWrites);
+  __ Dmb(OuterShareable, BarrierOther);
+
+  // DSB: the same (domain, type) pairs as for DMB.
+  __ Dsb(FullSystem, BarrierAll);
+  __ Dsb(FullSystem, BarrierReads);
+  __ Dsb(FullSystem, BarrierWrites);
+  __ Dsb(FullSystem, BarrierOther);
+
+  __ Dsb(InnerShareable, BarrierAll);
+  __ Dsb(InnerShareable, BarrierReads);
+  __ Dsb(InnerShareable, BarrierWrites);
+  __ Dsb(InnerShareable, BarrierOther);
+
+  __ Dsb(NonShareable, BarrierAll);
+  __ Dsb(NonShareable, BarrierReads);
+  __ Dsb(NonShareable, BarrierWrites);
+  __ Dsb(NonShareable, BarrierOther);
+
+  __ Dsb(OuterShareable, BarrierAll);
+  __ Dsb(OuterShareable, BarrierReads);
+  __ Dsb(OuterShareable, BarrierWrites);
+  __ Dsb(OuterShareable, BarrierOther);
+
+  // ISB: takes no arguments in this test.
+  __ Isb();
+
+  END();
+
+  RUN();  // Nothing is asserted afterwards; the code only has to execute.
+
+  TEARDOWN();
+}
+
 }  // namespace vixl
diff --git a/test/test-disasm-a64.cc b/test/test-disasm-a64.cc
index 408da39d..c04819e6 100644
--- a/test/test-disasm-a64.cc
+++ b/test/test-disasm-a64.cc
@@ -119,8 +119,8 @@ TEST(mov_mvn) {
   COMPARE(Mov(w14, Operand(w15, SXTH, 2)), "sbfiz w14, w15, #2, #16");
   COMPARE(Mov(x16, Operand(x17, SXTW, 3)), "sbfiz x16, x17, #3, #32");
 
-  COMPARE(Mvn(w0, Operand(0x1)), "movn w0, #0x1");
-  COMPARE(Mvn(x1, Operand(0xfff)), "movn x1, #0xfff");
+  COMPARE(Mvn(w0, Operand(0x101)), "movn w0, #0x101");
+  COMPARE(Mvn(x1, Operand(0xfff1)), "movn x1, #0xfff1");
   COMPARE(Mvn(w2, Operand(w3)), "mvn w2, w3");
   COMPARE(Mvn(x4, Operand(x5)), "mvn x4, x5");
   COMPARE(Mvn(w6, Operand(w7, LSL, 12)), "mvn w6, w7, lsl #12");
@@ -165,6 +165,61 @@ TEST(move_immediate) {
   CLEANUP();
 }
 
+TEST(move_immediate_2) {
+  SETUP_CLASS(MacroAssembler);
+
+  // Move instructions expected for certain immediates. This is really a macro
+  // assembler test, to ensure it generates immediates efficiently.
+  COMPARE(Mov(w0, 0), "movz w0, #0x0");  // 32-bit (W) destinations.
+  COMPARE(Mov(w0, 0x0000ffff), "movz w0, #0xffff");
+  COMPARE(Mov(w0, 0x00010000), "movz w0, #0x10000");
+  COMPARE(Mov(w0, 0xffff0000), "movz w0, #0xffff0000");
+  COMPARE(Mov(w0, 0x0001ffff), "movn w0, #0xfffe0000");  // Inverted value.
+  COMPARE(Mov(w0, 0xffff8000), "movn w0, #0x7fff");
+  COMPARE(Mov(w0, 0xfffffffe), "movn w0, #0x1");
+  COMPARE(Mov(w0, 0xffffffff), "movn w0, #0x0");
+  COMPARE(Mov(w0, 0x00ffff00), "mov w0, #0xffff00");
+  COMPARE(Mov(w0, 0xfffe7fff), "mov w0, #0xfffe7fff");
+  COMPARE(Mov(w0, 0xfffeffff), "movn w0, #0x10000");
+  COMPARE(Mov(w0, 0xffff7fff), "movn w0, #0x8000");
+
+  COMPARE(Mov(x0, 0), "movz x0, #0x0");  // 64-bit (X) destinations.
+  COMPARE(Mov(x0, 0x0000ffff), "movz x0, #0xffff");
+  COMPARE(Mov(x0, 0x00010000), "movz x0, #0x10000");
+  COMPARE(Mov(x0, 0xffff0000), "movz x0, #0xffff0000");
+  COMPARE(Mov(x0, 0x0001ffff), "mov x0, #0x1ffff");
+  COMPARE(Mov(x0, 0xffff8000), "mov x0, #0xffff8000");
+  COMPARE(Mov(x0, 0xfffffffe), "mov x0, #0xfffffffe");
+  COMPARE(Mov(x0, 0xffffffff), "mov x0, #0xffffffff");
+  COMPARE(Mov(x0, 0x00ffff00), "mov x0, #0xffff00");
+  COMPARE(Mov(x0, 0xffff000000000000), "movz x0, #0xffff000000000000");
+  COMPARE(Mov(x0, 0x0000ffff00000000), "movz x0, #0xffff00000000");
+  COMPARE(Mov(x0, 0x00000000ffff0000), "movz x0, #0xffff0000");
+  COMPARE(Mov(x0, 0xffffffffffff0000), "movn x0, #0xffff");
+  COMPARE(Mov(x0, 0xffffffff0000ffff), "movn x0, #0xffff0000");
+  COMPARE(Mov(x0, 0xffff0000ffffffff), "movn x0, #0xffff00000000");
+  COMPARE(Mov(x0, 0x0000ffffffffffff), "movn x0, #0xffff000000000000");
+  COMPARE(Mov(x0, 0xfffe7fffffffffff), "mov x0, #0xfffe7fffffffffff");
+  COMPARE(Mov(x0, 0xfffeffffffffffff), "movn x0, #0x1000000000000");
+  COMPARE(Mov(x0, 0xffff7fffffffffff), "movn x0, #0x800000000000");
+  COMPARE(Mov(x0, 0xfffffffe7fffffff), "mov x0, #0xfffffffe7fffffff");
+  COMPARE(Mov(x0, 0xfffffffeffffffff), "movn x0, #0x100000000");
+  COMPARE(Mov(x0, 0xffffffff7fffffff), "movn x0, #0x80000000");
+  COMPARE(Mov(x0, 0xfffffffffffe7fff), "mov x0, #0xfffffffffffe7fff");
+  COMPARE(Mov(x0, 0xfffffffffffeffff), "movn x0, #0x10000");
+  COMPARE(Mov(x0, 0xffffffffffff7fff), "movn x0, #0x8000");
+  COMPARE(Mov(x0, 0xffffffffffffffff), "movn x0, #0x0");
+
+  COMPARE(Movk(w0, 0x1234, 0), "movk w0, #0x1234");  // Third argument: shift.
+  COMPARE(Movk(x1, 0x2345, 0), "movk x1, #0x2345");
+  COMPARE(Movk(w2, 0x3456, 16), "movk w2, #0x3456, lsl #16");
+  COMPARE(Movk(x3, 0x4567, 16), "movk x3, #0x4567, lsl #16");
+  COMPARE(Movk(x4, 0x5678, 32), "movk x4, #0x5678, lsl #32");
+  COMPARE(Movk(x5, 0x6789, 48), "movk x5, #0x6789, lsl #48");
+
+  CLEANUP();
+}
+
 TEST(add_immediate) {
   SETUP();
 
@@ -177,9 +232,9 @@ TEST(add_immediate) {
           "add x10, x11, #0x3ff000 (4190208)");
   COMPARE(add(w12, w13, Operand(0xfff000)),
           "add w12, w13, #0xfff000 (16773120)");
-  COMPARE(add(w14, w15, Operand(0xff), SetFlags), "adds w14, w15, #0xff (255)");
-  COMPARE(add(x16, x17, Operand(0xaa000), SetFlags),
-          "adds x16, x17, #0xaa000 (696320)");
+  COMPARE(adds(w14, w15, Operand(0xff)), "adds w14, w15, #0xff (255)");
+  COMPARE(adds(x16, x17, Operand(0xaa000)), "adds x16, x17, #0xaa000 (696320)");
+
   COMPARE(cmn(w18, Operand(0xff)), "cmn w18, #0xff (255)");
   COMPARE(cmn(x19, Operand(0xff000)), "cmn x19, #0xff000 (1044480)");
   COMPARE(add(w0, wsp, Operand(0)), "mov w0, wsp");
@@ -189,7 +244,7 @@ TEST(add_immediate) {
   COMPARE(add(x2, sp, Operand(16)), "add x2, sp, #0x10 (16)");
   COMPARE(add(wsp, wsp, Operand(42)), "add wsp, wsp, #0x2a (42)");
   COMPARE(cmn(sp, Operand(24)), "cmn sp, #0x18 (24)");
-  COMPARE(add(wzr, wsp, Operand(9), SetFlags), "cmn wsp, #0x9 (9)");
+  COMPARE(adds(wzr, wsp, Operand(9)), "cmn wsp, #0x9 (9)");
 
   CLEANUP();
 }
@@ -206,9 +261,8 @@ TEST(sub_immediate) {
           "sub x10, x11, #0x3ff000 (4190208)");
   COMPARE(sub(w12, w13, Operand(0xfff000)),
           "sub w12, w13, #0xfff000 (16773120)");
-  COMPARE(sub(w14, w15, Operand(0xff), SetFlags), "subs w14, w15, #0xff (255)");
-  COMPARE(sub(x16, x17, Operand(0xaa000), SetFlags),
-          "subs x16, x17, #0xaa000 (696320)");
+  COMPARE(subs(w14, w15, Operand(0xff)), "subs w14, w15, #0xff (255)");
+  COMPARE(subs(x16, x17, Operand(0xaa000)), "subs x16, x17, #0xaa000 (696320)");
   COMPARE(cmp(w18, Operand(0xff)), "cmp w18, #0xff (255)");
   COMPARE(cmp(x19, Operand(0xff000)), "cmp x19, #0xff000 (1044480)");
 
@@ -216,7 +270,7 @@ TEST(sub_immediate) {
   COMPARE(sub(x2, sp, Operand(16)), "sub x2, sp, #0x10 (16)");
   COMPARE(sub(wsp, wsp, Operand(42)), "sub wsp, wsp, #0x2a (42)");
   COMPARE(cmp(sp, Operand(24)), "cmp sp, #0x18 (24)");
-  COMPARE(sub(wzr, wsp, Operand(9), SetFlags), "cmp wsp, #0x9 (9)");
+  COMPARE(subs(wzr, wsp, Operand(9)), "cmp wsp, #0x9 (9)");
 
   CLEANUP();
 }
@@ -241,8 +295,8 @@ TEST(add_shifted) {
   COMPARE(add(x4, sp, Operand(x5, LSL, 1)), "add x4, sp, x5, lsl #1");
   COMPARE(add(x4, xzr, Operand(x5, LSL, 1)), "add x4, xzr, x5, lsl #1");
   COMPARE(add(w6, wsp, Operand(w7, LSL, 3)), "add w6, wsp, w7, lsl #3");
-  COMPARE(add(xzr, sp, Operand(x8, LSL, 4), SetFlags), "cmn sp, x8, lsl #4");
-  COMPARE(add(xzr, xzr, Operand(x8, LSL, 5), SetFlags), "cmn xzr, x8, lsl #5");
+  COMPARE(adds(xzr, sp, Operand(x8, LSL, 4)), "cmn sp, x8, lsl #4");
+  COMPARE(adds(xzr, xzr, Operand(x8, LSL, 5)), "cmn xzr, x8, lsl #5");
 
   CLEANUP();
 }
@@ -263,16 +317,16 @@ TEST(sub_shifted) {
   COMPARE(cmp(x26, Operand(x27, LSL, 63)), "cmp x26, x27, lsl #63");
   COMPARE(neg(w28, Operand(w29)), "neg w28, w29");
   COMPARE(neg(x30, Operand(x0, LSR, 62)), "neg x30, x0, lsr #62");
-  COMPARE(neg(w1, Operand(w2), SetFlags), "negs w1, w2");
-  COMPARE(neg(x3, Operand(x4, ASR, 61), SetFlags), "negs x3, x4, asr #61");
+  COMPARE(negs(w1, Operand(w2)), "negs w1, w2");
+  COMPARE(negs(x3, Operand(x4, ASR, 61)), "negs x3, x4, asr #61");
 
   COMPARE(sub(x0, sp, Operand(x1)), "sub x0, sp, x1");
   COMPARE(sub(w2, wsp, Operand(w3)), "sub w2, wsp, w3");
   COMPARE(sub(x4, sp, Operand(x5, LSL, 1)), "sub x4, sp, x5, lsl #1");
   COMPARE(sub(x4, xzr, Operand(x5, LSL, 1)), "neg x4, x5, lsl #1");
   COMPARE(sub(w6, wsp, Operand(w7, LSL, 3)), "sub w6, wsp, w7, lsl #3");
-  COMPARE(sub(xzr, sp, Operand(x8, LSL, 4), SetFlags), "cmp sp, x8, lsl #4");
-  COMPARE(sub(xzr, xzr, Operand(x8, LSL, 5), SetFlags), "cmp xzr, x8, lsl #5");
+  COMPARE(subs(xzr, sp, Operand(x8, LSL, 4)), "cmp sp, x8, lsl #4");
+  COMPARE(subs(xzr, xzr, Operand(x8, LSL, 5)), "cmp xzr, x8, lsl #5");
 
   CLEANUP();
 }
@@ -282,20 +336,15 @@ TEST(add_extended) {
   SETUP();
 
   COMPARE(add(w0, w1, Operand(w2, UXTB)), "add w0, w1, w2, uxtb");
-  COMPARE(add(x3, x4, Operand(w5, UXTB, 1), SetFlags),
-          "adds x3, x4, w5, uxtb #1");
+  COMPARE(adds(x3, x4, Operand(w5, UXTB, 1)), "adds x3, x4, w5, uxtb #1");
   COMPARE(add(w6, w7, Operand(w8, UXTH, 2)), "add w6, w7, w8, uxth #2");
-  COMPARE(add(x9, x10, Operand(x11, UXTW, 3), SetFlags),
-          "adds x9, x10, w11, uxtw #3");
+  COMPARE(adds(x9, x10, Operand(x11, UXTW, 3)), "adds x9, x10, w11, uxtw #3");
   COMPARE(add(x12, x13, Operand(x14, UXTX, 4)), "add x12, x13, x14, uxtx #4");
-  COMPARE(add(w15, w16, Operand(w17, SXTB, 4), SetFlags),
-          "adds w15, w16, w17, sxtb #4");
+  COMPARE(adds(w15, w16, Operand(w17, SXTB, 4)), "adds w15, w16, w17, sxtb #4");
   COMPARE(add(x18, x19, Operand(x20, SXTB, 3)), "add x18, x19, w20, sxtb #3");
-  COMPARE(add(w21, w22, Operand(w23, SXTH, 2), SetFlags),
-          "adds w21, w22, w23, sxth #2");
+  COMPARE(adds(w21, w22, Operand(w23, SXTH, 2)), "adds w21, w22, w23, sxth #2");
   COMPARE(add(x24, x25, Operand(x26, SXTW, 1)), "add x24, x25, w26, sxtw #1");
-  COMPARE(add(x27, x28, Operand(x29, SXTX), SetFlags),
-          "adds x27, x28, x29, sxtx");
+  COMPARE(adds(x27, x28, Operand(x29, SXTX)), "adds x27, x28, x29, sxtx");
   COMPARE(cmn(w0, Operand(w1, UXTB, 2)), "cmn w0, w1, uxtb #2");
   COMPARE(cmn(x2, Operand(x3, SXTH, 4)), "cmn x2, w3, sxth #4");
 
@@ -313,20 +362,15 @@ TEST(sub_extended) {
   SETUP();
 
   COMPARE(sub(w0, w1, Operand(w2, UXTB)), "sub w0, w1, w2, uxtb");
-  COMPARE(sub(x3, x4, Operand(w5, UXTB, 1), SetFlags),
-          "subs x3, x4, w5, uxtb #1");
+  COMPARE(subs(x3, x4, Operand(w5, UXTB, 1)), "subs x3, x4, w5, uxtb #1");
   COMPARE(sub(w6, w7, Operand(w8, UXTH, 2)), "sub w6, w7, w8, uxth #2");
-  COMPARE(sub(x9, x10, Operand(x11, UXTW, 3), SetFlags),
-          "subs x9, x10, w11, uxtw #3");
+  COMPARE(subs(x9, x10, Operand(x11, UXTW, 3)), "subs x9, x10, w11, uxtw #3");
   COMPARE(sub(x12, x13, Operand(x14, UXTX, 4)), "sub x12, x13, x14, uxtx #4");
-  COMPARE(sub(w15, w16, Operand(w17, SXTB, 4), SetFlags),
-          "subs w15, w16, w17, sxtb #4");
+  COMPARE(subs(w15, w16, Operand(w17, SXTB, 4)), "subs w15, w16, w17, sxtb #4");
   COMPARE(sub(x18, x19, Operand(x20, SXTB, 3)), "sub x18, x19, w20, sxtb #3");
-  COMPARE(sub(w21, w22, Operand(w23, SXTH, 2), SetFlags),
-          "subs w21, w22, w23, sxth #2");
+  COMPARE(subs(w21, w22, Operand(w23, SXTH, 2)), "subs w21, w22, w23, sxth #2");
   COMPARE(sub(x24, x25, Operand(x26, SXTW, 1)), "sub x24, x25, w26, sxtw #1");
-  COMPARE(sub(x27, x28, Operand(x29, SXTX), SetFlags),
-          "subs x27, x28, x29, sxtx");
+  COMPARE(subs(x27, x28, Operand(x29, SXTX)), "subs x27, x28, x29, sxtx");
   COMPARE(cmp(w0, Operand(w1, SXTB, 1)), "cmp w0, w1, sxtb #1");
   COMPARE(cmp(x2, Operand(x3, UXTH, 3)), "cmp x2, w3, uxth #3");
 
@@ -345,16 +389,16 @@ TEST(adc_subc_ngc) {
 
   COMPARE(adc(w0, w1, Operand(w2)), "adc w0, w1, w2");
   COMPARE(adc(x3, x4, Operand(x5)), "adc x3, x4, x5");
-  COMPARE(adc(w6, w7, Operand(w8), SetFlags), "adcs w6, w7, w8");
-  COMPARE(adc(x9, x10, Operand(x11), SetFlags), "adcs x9, x10, x11");
+  COMPARE(adcs(w6, w7, Operand(w8)), "adcs w6, w7, w8");
+  COMPARE(adcs(x9, x10, Operand(x11)), "adcs x9, x10, x11");
   COMPARE(sbc(w12, w13, Operand(w14)), "sbc w12, w13, w14");
   COMPARE(sbc(x15, x16, Operand(x17)), "sbc x15, x16, x17");
-  COMPARE(sbc(w18, w19, Operand(w20), SetFlags), "sbcs w18, w19, w20");
-  COMPARE(sbc(x21, x22, Operand(x23), SetFlags), "sbcs x21, x22, x23");
+  COMPARE(sbcs(w18, w19, Operand(w20)), "sbcs w18, w19, w20");
+  COMPARE(sbcs(x21, x22, Operand(x23)), "sbcs x21, x22, x23");
   COMPARE(ngc(w24, Operand(w25)), "ngc w24, w25");
   COMPARE(ngc(x26, Operand(x27)), "ngc x26, x27");
-  COMPARE(ngc(w28, Operand(w29), SetFlags), "ngcs w28, w29");
-  COMPARE(ngc(x30, Operand(x0), SetFlags), "ngcs x30, x0");
+  COMPARE(ngcs(w28, Operand(w29)), "ngcs w28, w29");
+  COMPARE(ngcs(x30, Operand(x0)), "ngcs x30, x0");
 
   CLEANUP();
 }
@@ -445,6 +489,10 @@ TEST(bitfield) {
   COMPARE(sxth(w4, w5), "sxth w4, w5");
   COMPARE(sxth(x6, x7), "sxth x6, w7");
   COMPARE(sxtw(x8, x9), "sxtw x8, w9");
+  COMPARE(sxtb(x0, w1), "sxtb x0, w1");
+  COMPARE(sxth(x2, w3), "sxth x2, w3");
+  COMPARE(sxtw(x4, w5), "sxtw x4, w5");
+
   COMPARE(uxtb(w10, w11), "uxtb w10, w11");
   COMPARE(uxtb(x12, x13), "uxtb x12, w13");
   COMPARE(uxth(w14, w15), "uxth w14, w15");
@@ -567,9 +615,8 @@ TEST(logical_immediate) {
           "eor w15, w16, #0x1");
   COMPARE(eor(x17, x18, Operand(0x0000000000000003L)),
           "eor x17, x18, #0x3");
-  COMPARE(and_(w23, w24, Operand(0x0000000f), SetFlags),
-          "ands w23, w24, #0xf");
-  COMPARE(and_(x25, x26, Operand(0x800000000000000fL), SetFlags),
+  COMPARE(ands(w23, w24, Operand(0x0000000f)), "ands w23, w24, #0xf");
+  COMPARE(ands(x25, x26, Operand(0x800000000000000fL)),
           "ands x25, x26, #0x800000000000000f");
 
   // Test inverse.
@@ -585,14 +632,13 @@ TEST(logical_immediate) {
           "eor w19, w20, #0x7ffffffe");
   COMPARE(eon(x21, x22, Operand(0xc000000000000003L)),
           "eor x21, x22, #0x3ffffffffffffffc");
-  COMPARE(bic(w27, w28, Operand(0xfffffff7), SetFlags),
-          "ands w27, w28, #0x8");
-  COMPARE(bic(x29, x0, Operand(0xfffffffeffffffffL), SetFlags),
+  COMPARE(bics(w27, w28, Operand(0xfffffff7)), "ands w27, w28, #0x8");
+  COMPARE(bics(x29, x0, Operand(0xfffffffeffffffffL)),
           "ands x29, x0, #0x100000000");
 
   // Test stack pointer.
   COMPARE(and_(wsp, wzr, Operand(7)), "and wsp, wzr, #0x7");
-  COMPARE(and_(xzr, xzr, Operand(7), SetFlags), "tst xzr, #0x7");
+  COMPARE(ands(xzr, xzr, Operand(7)), "tst xzr, #0x7");
   COMPARE(orr(sp, xzr, Operand(15)), "orr sp, xzr, #0xf");
   COMPARE(eor(wsp, w0, Operand(31)), "eor wsp, w0, #0x1f");
 
@@ -656,25 +702,17 @@ TEST(logical_shifted) {
   COMPARE(eon(x24, x25, Operand(x26, ASR, 23)), "eon x24, x25, x26, asr #23");
   COMPARE(eon(w27, w28, Operand(w29, ROR, 24)), "eon w27, w28, w29, ror #24");
 
-  COMPARE(and_(w0, w1, Operand(w2), SetFlags), "ands w0, w1, w2");
-  COMPARE(and_(x3, x4, Operand(x5, LSL, 1), SetFlags),
-          "ands x3, x4, x5, lsl #1");
-  COMPARE(and_(w6, w7, Operand(w8, LSR, 2), SetFlags),
-          "ands w6, w7, w8, lsr #2");
-  COMPARE(and_(x9, x10, Operand(x11, ASR, 3), SetFlags),
-          "ands x9, x10, x11, asr #3");
-  COMPARE(and_(w12, w13, Operand(w14, ROR, 4), SetFlags),
-          "ands w12, w13, w14, ror #4");
-
-  COMPARE(bic(w15, w16, Operand(w17), SetFlags), "bics w15, w16, w17");
-  COMPARE(bic(x18, x19, Operand(x20, LSL, 5), SetFlags),
-          "bics x18, x19, x20, lsl #5");
-  COMPARE(bic(w21, w22, Operand(w23, LSR, 6), SetFlags),
-          "bics w21, w22, w23, lsr #6");
-  COMPARE(bic(x24, x25, Operand(x26, ASR, 7), SetFlags),
-          "bics x24, x25, x26, asr #7");
-  COMPARE(bic(w27, w28, Operand(w29, ROR, 8), SetFlags),
-          "bics w27, w28, w29, ror #8");
+  COMPARE(ands(w0, w1, Operand(w2)), "ands w0, w1, w2");
+  COMPARE(ands(x3, x4, Operand(x5, LSL, 1)), "ands x3, x4, x5, lsl #1");
+  COMPARE(ands(w6, w7, Operand(w8, LSR, 2)), "ands w6, w7, w8, lsr #2");
+  COMPARE(ands(x9, x10, Operand(x11, ASR, 3)), "ands x9, x10, x11, asr #3");
+  COMPARE(ands(w12, w13, Operand(w14, ROR, 4)), "ands w12, w13, w14, ror #4");
+
+  COMPARE(bics(w15, w16, Operand(w17)), "bics w15, w16, w17");
+  COMPARE(bics(x18, x19, Operand(x20, LSL, 5)), "bics x18, x19, x20, lsl #5");
+  COMPARE(bics(w21, w22, Operand(w23, LSR, 6)), "bics w21, w22, w23, lsr #6");
+  COMPARE(bics(x24, x25, Operand(x26, ASR, 7)), "bics x24, x25, x26, asr #7");
+  COMPARE(bics(w27, w28, Operand(w29, ROR, 8)), "bics w27, w28, w29, ror #8");
 
   COMPARE(tst(w0, Operand(w1)), "tst w0, w1");
   COMPARE(tst(w2, Operand(w3, ROR, 10)), "tst w2, w3, ror #10");
@@ -745,11 +783,16 @@ TEST(branch) {
   COMPARE(cbz(x1, INST_OFF(-0x100000)), "cbz x1, #-0x100000");
   COMPARE(cbnz(w2, INST_OFF(0xffffc)), "cbnz w2, #+0xffffc");
   COMPARE(cbnz(x3, INST_OFF(-0x100000)), "cbnz x3, #-0x100000");
-  COMPARE(tbz(x4, 0, INST_OFF(0x7ffc)), "tbz w4, #0, #+0x7ffc");
+  COMPARE(tbz(w4, 0, INST_OFF(0x7ffc)), "tbz w4, #0, #+0x7ffc");
   COMPARE(tbz(x5, 63, INST_OFF(-0x8000)), "tbz x5, #63, #-0x8000");
-  COMPARE(tbnz(x6, 0, INST_OFF(0x7ffc)), "tbnz w6, #0, #+0x7ffc");
-  COMPARE(tbnz(x7, 63, INST_OFF(-0x8000)), "tbnz x7, #63, #-0x8000");
-
+  COMPARE(tbz(w6, 31, INST_OFF(0)), "tbz w6, #31, #+0x0");
+  COMPARE(tbz(x7, 31, INST_OFF(0x4)), "tbz w7, #31, #+0x4");
+  COMPARE(tbz(x8, 32, INST_OFF(0x8)), "tbz x8, #32, #+0x8");
+  COMPARE(tbnz(w8, 0, INST_OFF(0x7ffc)), "tbnz w8, #0, #+0x7ffc");
+  COMPARE(tbnz(x9, 63, INST_OFF(-0x8000)), "tbnz x9, #63, #-0x8000");
+  COMPARE(tbnz(w10, 31, INST_OFF(0)), "tbnz w10, #31, #+0x0");
+  COMPARE(tbnz(x11, 31, INST_OFF(0x4)), "tbnz w11, #31, #+0x4");
+  COMPARE(tbnz(x12, 32, INST_OFF(0x8)), "tbnz x12, #32, #+0x8");
   COMPARE(br(x0), "br x0");
   COMPARE(blr(x1), "blr x1");
   COMPARE(ret(x2), "ret x2");
@@ -1229,6 +1272,19 @@ TEST(cond_select) {
   CLEANUP();
 }
 
+TEST(cond_select_macro) {  // Csel with the immediate operands -1, 0 and 1.
+  SETUP_CLASS(MacroAssembler);
+
+  COMPARE(Csel(w0, w1, -1, eq), "csinv w0, w1, wzr, eq");  // -1: csinv on zr.
+  COMPARE(Csel(w2, w3, 0, ne), "csel w2, w3, wzr, ne");  // 0: csel on zr.
+  COMPARE(Csel(w4, w5, 1, hs), "csinc w4, w5, wzr, hs");  // 1: csinc on zr.
+  COMPARE(Csel(x6, x7, -1, lo), "csinv x6, x7, xzr, lo");
+  COMPARE(Csel(x8, x9, 0, mi), "csel x8, x9, xzr, mi");
+  COMPARE(Csel(x10, x11, 1, pl), "csinc x10, x11, xzr, pl");
+
+  CLEANUP();
+}
+
 TEST(cond_cmp) {
   SETUP();
 
@@ -1246,6 +1302,17 @@ TEST(cond_cmp) {
   CLEANUP();
 }
 
+TEST(cond_cmp_macro) {  // Negative immediates: Ccmp <-> Ccmn with |imm|.
+  SETUP_CLASS(MacroAssembler);
+
+  COMPARE(Ccmp(w0, -1, VFlag, hi), "ccmn w0, #1, #nzcV, hi");  // Ccmp -> ccmn.
+  COMPARE(Ccmp(x1, -31, CFlag, ge), "ccmn x1, #31, #nzCv, ge");
+  COMPARE(Ccmn(w2, -1, CVFlag, gt), "ccmp w2, #1, #nzCV, gt");  // Ccmn -> ccmp.
+  COMPARE(Ccmn(x3, -31, ZCVFlag, ls), "ccmp x3, #31, #nZCV, ls");
+
+  CLEANUP();
+}
+
 TEST(fmov_imm) {
   SETUP();
 
@@ -1286,6 +1353,10 @@ TEST(fp_dp1) {
   COMPARE(fsqrt(s31, s30), "fsqrt s31, s30");
   COMPARE(fsqrt(d10, d11), "fsqrt d10, d11");
   COMPARE(fsqrt(d31, d30), "fsqrt d31, d30");
+  COMPARE(frinta(s10, s11), "frinta s10, s11");
+  COMPARE(frinta(s31, s30), "frinta s31, s30");
+  COMPARE(frinta(d12, d13), "frinta d12, d13");
+  COMPARE(frinta(d31, d30), "frinta d31, d30");
   COMPARE(frintn(s10, s11), "frintn s10, s11");
   COMPARE(frintn(s31, s30), "frintn s31, s30");
   COMPARE(frintn(d12, d13), "frintn d12, d13");
@@ -1316,6 +1387,10 @@ TEST(fp_dp2) {
   COMPARE(fmax(d22, d23, d24), "fmax d22, d23, d24");
   COMPARE(fmin(s25, s26, s27), "fmin s25, s26, s27");
   COMPARE(fmin(d28, d29, d30), "fmin d28, d29, d30");
+  COMPARE(fmaxnm(s31, s0, s1), "fmaxnm s31, s0, s1");
+  COMPARE(fmaxnm(d2, d3, d4), "fmaxnm d2, d3, d4");
+  COMPARE(fminnm(s5, s6, s7), "fminnm s5, s6, s7");
+  COMPARE(fminnm(d8, d9, d10), "fminnm d8, d9, d10");
 
   CLEANUP();
 }
@@ -1324,9 +1399,16 @@ TEST(fp_dp2) {
 TEST(fp_dp3) {
   SETUP();
 
+  COMPARE(fmadd(s7, s8, s9, s10), "fmadd s7, s8, s9, s10");
+  COMPARE(fmadd(d10, d11, d12, d10), "fmadd d10, d11, d12, d10");
   COMPARE(fmsub(s7, s8, s9, s10), "fmsub s7, s8, s9, s10");
   COMPARE(fmsub(d10, d11, d12, d10), "fmsub d10, d11, d12, d10");
 
+  COMPARE(fnmadd(s7, s8, s9, s10), "fnmadd s7, s8, s9, s10");
+  COMPARE(fnmadd(d10, d11, d12, d10), "fnmadd d10, d11, d12, d10");
+  COMPARE(fnmsub(s7, s8, s9, s10), "fnmsub s7, s8, s9, s10");
+  COMPARE(fnmsub(d10, d11, d12, d10), "fnmsub d10, d11, d12, d10");
+
   CLEANUP();
 }
 
@@ -1380,6 +1462,14 @@ TEST(fp_select) {
 TEST(fcvt_scvtf_ucvtf) {
   SETUP();
 
+  COMPARE(fcvtas(w0, s1), "fcvtas w0, s1");
+  COMPARE(fcvtas(x2, s3), "fcvtas x2, s3");
+  COMPARE(fcvtas(w4, d5), "fcvtas w4, d5");
+  COMPARE(fcvtas(x6, d7), "fcvtas x6, d7");
+  COMPARE(fcvtau(w8, s9), "fcvtau w8, s9");
+  COMPARE(fcvtau(x10, s11), "fcvtau x10, s11");
+  COMPARE(fcvtau(w12, d13), "fcvtau w12, d13");
+  COMPARE(fcvtau(x14, d15), "fcvtau x14, d15");
   COMPARE(fcvtns(w0, s1), "fcvtns w0, s1");
   COMPARE(fcvtns(x2, s3), "fcvtns x2, s3");
   COMPARE(fcvtns(w4, d5), "fcvtns w4, d5");
@@ -1545,6 +1635,16 @@ TEST(add_sub_negative) {
   COMPARE(Sub(w21, w3, -0xbc), "add w21, w3, #0xbc (188)");
   COMPARE(Sub(w22, w4, -2000), "add w22, w4, #0x7d0 (2000)");
 
+  COMPARE(Cmp(w0, -1), "cmn w0, #0x1 (1)");
+  COMPARE(Cmp(x1, -1), "cmn x1, #0x1 (1)");
+  COMPARE(Cmp(w2, -4095), "cmn w2, #0xfff (4095)");
+  COMPARE(Cmp(x3, -4095), "cmn x3, #0xfff (4095)");
+
+  COMPARE(Cmn(w0, -1), "cmp w0, #0x1 (1)");
+  COMPARE(Cmn(x1, -1), "cmp x1, #0x1 (1)");
+  COMPARE(Cmn(w2, -4095), "cmp w2, #0xfff (4095)");
+  COMPARE(Cmn(x3, -4095), "cmp x3, #0xfff (4095)");
+
   CLEANUP();
 }
 
@@ -1586,4 +1686,57 @@ TEST(logical_immediate_move) {
 
   CLEANUP();
 }
+
+TEST(barriers) {  // Expected disassembly of the Dmb, Dsb and Isb macros.
+  SETUP_CLASS(MacroAssembler);
+
+  // DMB. BarrierOther is expected as "sy" plus a 4-bit binary value.
+  COMPARE(Dmb(FullSystem, BarrierAll), "dmb sy");
+  COMPARE(Dmb(FullSystem, BarrierReads), "dmb ld");
+  COMPARE(Dmb(FullSystem, BarrierWrites), "dmb st");
+
+  COMPARE(Dmb(InnerShareable, BarrierAll), "dmb ish");
+  COMPARE(Dmb(InnerShareable, BarrierReads), "dmb ishld");
+  COMPARE(Dmb(InnerShareable, BarrierWrites), "dmb ishst");
+
+  COMPARE(Dmb(NonShareable, BarrierAll), "dmb nsh");
+  COMPARE(Dmb(NonShareable, BarrierReads), "dmb nshld");
+  COMPARE(Dmb(NonShareable, BarrierWrites), "dmb nshst");
+
+  COMPARE(Dmb(OuterShareable, BarrierAll), "dmb osh");
+  COMPARE(Dmb(OuterShareable, BarrierReads), "dmb oshld");
+  COMPARE(Dmb(OuterShareable, BarrierWrites), "dmb oshst");
+
+  COMPARE(Dmb(FullSystem, BarrierOther), "dmb sy (0b1100)");
+  COMPARE(Dmb(InnerShareable, BarrierOther), "dmb sy (0b1000)");
+  COMPARE(Dmb(NonShareable, BarrierOther), "dmb sy (0b0100)");
+  COMPARE(Dmb(OuterShareable, BarrierOther), "dmb sy (0b0000)");
+
+  // DSB. Same option names and BarrierOther formatting as for DMB.
+  COMPARE(Dsb(FullSystem, BarrierAll), "dsb sy");
+  COMPARE(Dsb(FullSystem, BarrierReads), "dsb ld");
+  COMPARE(Dsb(FullSystem, BarrierWrites), "dsb st");
+
+  COMPARE(Dsb(InnerShareable, BarrierAll), "dsb ish");
+  COMPARE(Dsb(InnerShareable, BarrierReads), "dsb ishld");
+  COMPARE(Dsb(InnerShareable, BarrierWrites), "dsb ishst");
+
+  COMPARE(Dsb(NonShareable, BarrierAll), "dsb nsh");
+  COMPARE(Dsb(NonShareable, BarrierReads), "dsb nshld");
+  COMPARE(Dsb(NonShareable, BarrierWrites), "dsb nshst");
+
+  COMPARE(Dsb(OuterShareable, BarrierAll), "dsb osh");
+  COMPARE(Dsb(OuterShareable, BarrierReads), "dsb oshld");
+  COMPARE(Dsb(OuterShareable, BarrierWrites), "dsb oshst");
+
+  COMPARE(Dsb(FullSystem, BarrierOther), "dsb sy (0b1100)");
+  COMPARE(Dsb(InnerShareable, BarrierOther), "dsb sy (0b1000)");
+  COMPARE(Dsb(NonShareable, BarrierOther), "dsb sy (0b0100)");
+  COMPARE(Dsb(OuterShareable, BarrierOther), "dsb sy (0b0000)");
+
+  // ISB. No option is passed, and none is expected in the output.
+  COMPARE(Isb(), "isb");
+
+  CLEANUP();
+}
 }  // namespace vixl
diff --git a/tools/make_instruction_doc.pl b/tools/make_instruction_doc.pl
index a244962c..5457c38e 100755
--- a/tools/make_instruction_doc.pl
+++ b/tools/make_instruction_doc.pl
@@ -39,7 +39,7 @@
 while(<IN>)
 {
   # Find a function formatted like an instruction.
-  if(my($t) = /^  ((?:void|inline void) [a-z0-9]{1,6})\(/mgp)
+  if(my($t) = /^  ((?:void|inline void) [a-z0-9]{1,6}_?)\(/mgp)
   {
     my $before = ${^PREMATCH};
     my $after = ${^POSTMATCH};