update

[l4.git] / l4 / pkg / valgrind / src / valgrind-3.6.0-svn / VEX / priv / guest_arm_toIR.c
diff --git a/l4/pkg/valgrind/src/valgrind-3.6.0-svn/VEX/priv/guest_arm_toIR.c b/l4/pkg/valgrind/src/valgrind-3.6.0-svn/VEX/priv/guest_arm_toIR.c

index e570ba80ae2695cbd512e2883f5f1c7e2de235f1..684f88138bc85f4cb6a041c18ab00966d6eb6c0d 100644 (file)
--- a/l4/pkg/valgrind/src/valgrind-3.6.0-svn/VEX/priv/guest_arm_toIR.c
+++ b/l4/pkg/valgrind/src/valgrind-3.6.0-svn/VEX/priv/guest_arm_toIR.c
@@ -9,8 +9,11 @@
  
     Copyright (C) 2004-2010 OpenWorks LLP
        info@open-works.net
-   Copyright (C) 2010-2010 Dmitry Zhurikhin
-      zhur@ispras.ru
+
+   NEON support is
+   Copyright (C) 2010-2010 Samsung Electronics
+   contributed by Dmitry Zhurikhin <zhur@ispras.ru>
+              and Kirill Batuzov <batuzovk@ispras.ru>
  
     This program is free software; you can redistribute it and/or
     modify it under the terms of the GNU General Public License as
@@ -254,6 +257,10 @@ static UInt setbit32 ( UInt x, Int ix, UInt b )
     (((_b8) << 8) \
      | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
  
+#define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
+   (((_b9) << 9) | ((_b8) << 8)                                \
+    | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
+
  /* produces _uint[_bMax:_bMin] */
  #define SLICE_UInt(_uint,_bMax,_bMin) \
     (( ((UInt)(_uint)) >> (_bMin)) \
@@ -427,6 +434,10 @@ static IRExpr* align4if ( IRExpr* e, Bool b )
  #define OFFB_TPIDRURO offsetof(VexGuestARMState,guest_TPIDRURO)
  #define OFFB_ITSTATE  offsetof(VexGuestARMState,guest_ITSTATE)
  #define OFFB_QFLAG32  offsetof(VexGuestARMState,guest_QFLAG32)
+#define OFFB_GEFLAG0  offsetof(VexGuestARMState,guest_GEFLAG0)
+#define OFFB_GEFLAG1  offsetof(VexGuestARMState,guest_GEFLAG1)
+#define OFFB_GEFLAG2  offsetof(VexGuestARMState,guest_GEFLAG2)
+#define OFFB_GEFLAG3  offsetof(VexGuestARMState,guest_GEFLAG3)
  
  
  /* ---------------- Integer registers ---------------- */
@@ -853,6 +864,10 @@ static void putMiscReg32 ( UInt    gsoffset,
     switch (gsoffset) {
        case OFFB_FPSCR:   break;
        case OFFB_QFLAG32: break;
+      case OFFB_GEFLAG0: break;
+      case OFFB_GEFLAG1: break;
+      case OFFB_GEFLAG2: break;
+      case OFFB_GEFLAG3: break;
        default: vassert(0); /* awaiting more cases */
     }
     vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
@@ -897,14 +912,98 @@ static void put_QFLAG32 ( IRTemp t, IRTemp condT )
     putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT );
  }
  
-static void or_into_QFLAG32 ( IRTemp t, IRTemp condT )
+/* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program
+   Status Register) to indicate that overflow or saturation occurred.
+   Nb: e must be zero to denote no saturation, and any nonzero
+   value to indicate saturation. */
+static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT )
  {
     IRTemp old = get_QFLAG32();
     IRTemp nyu = newTemp(Ity_I32);
-   assign(nyu, binop(Iop_Or32, mkexpr(old), mkexpr(t)) );
+   assign(nyu, binop(Iop_Or32, mkexpr(old), e) );
     put_QFLAG32(nyu, condT);
  }
  
+/* Generate code to set APSR.GE[flagNo]. Each fn call sets 1 bit.
+   flagNo: which flag bit to set [3...0]
+   lowbits_to_ignore:  0 = look at all 32 bits
+                       8 = look at top 24 bits only
+                      16 = look at top 16 bits only
+                      31 = look at the top bit only
+   e: input value to be evaluated.
+   The new value is taken from 'e' shifted right by 'lowbits_to_ignore'
+   bits, which discards those low bits.  If the resulting value is zero
+   then the GE flag is set to 0; any other value sets the flag to 1. */
+static void put_GEFLAG32 ( Int flagNo,            /* 0, 1, 2 or 3 */
+                           Int lowbits_to_ignore, /* 0, 8, 16 or 31   */
+                           IRExpr* e,             /* Ity_I32 */
+                           IRTemp condT )
+{
+   vassert( flagNo >= 0 && flagNo <= 3 );
+   vassert( lowbits_to_ignore == 0  || 
+            lowbits_to_ignore == 8  || 
+            lowbits_to_ignore == 16 ||
+            lowbits_to_ignore == 31 );
+   IRTemp masked = newTemp(Ity_I32);
+   assign(masked, binop(Iop_Shr32, e, mkU8(lowbits_to_ignore)));
+ 
+   switch (flagNo) {
+      case 0: putMiscReg32(OFFB_GEFLAG0, mkexpr(masked), condT); break;
+      case 1: putMiscReg32(OFFB_GEFLAG1, mkexpr(masked), condT); break;
+      case 2: putMiscReg32(OFFB_GEFLAG2, mkexpr(masked), condT); break;
+      case 3: putMiscReg32(OFFB_GEFLAG3, mkexpr(masked), condT); break;
+      default: vassert(0);
+   }
+}
+
+/* Return the value (in the 32-bit, zero-or-nonzero representation
+   scheme) of the specified GE flag. */
+static IRExpr* get_GEFLAG32( Int flagNo /* 0, 1, 2, 3 */ )
+{
+   switch (flagNo) {
+      case 0: return IRExpr_Get( OFFB_GEFLAG0, Ity_I32 );
+      case 1: return IRExpr_Get( OFFB_GEFLAG1, Ity_I32 );
+      case 2: return IRExpr_Get( OFFB_GEFLAG2, Ity_I32 );
+      case 3: return IRExpr_Get( OFFB_GEFLAG3, Ity_I32 );
+      default: vassert(0);
+   }
+}
+
+/* Set all 4 GE flags from the given 32-bit value as follows: GE 3 and
+   2 are set from bit 31 of the value, and GE 1 and 0 are set from bit
+   15 of the value.  All other bits are ignored. */
+static void set_GE_32_10_from_bits_31_15 ( IRTemp t32, IRTemp condT )
+{
+   IRTemp ge10 = newTemp(Ity_I32);
+   IRTemp ge32 = newTemp(Ity_I32);
+   assign(ge10, binop(Iop_And32, mkexpr(t32), mkU32(0x00008000)));
+   assign(ge32, binop(Iop_And32, mkexpr(t32), mkU32(0x80000000)));
+   put_GEFLAG32( 0, 0, mkexpr(ge10), condT );
+   put_GEFLAG32( 1, 0, mkexpr(ge10), condT );
+   put_GEFLAG32( 2, 0, mkexpr(ge32), condT );
+   put_GEFLAG32( 3, 0, mkexpr(ge32), condT );
+}
+
+
+/* Set all 4 GE flags from the given 32-bit value as follows: GE 3
+   from bit 31, GE 2 from bit 23, GE 1 from bit 15, and GE0 from
+   bit 7.  All other bits are ignored. */
+static void set_GE_3_2_1_0_from_bits_31_23_15_7 ( IRTemp t32, IRTemp condT )
+{
+   IRTemp ge0 = newTemp(Ity_I32);
+   IRTemp ge1 = newTemp(Ity_I32);
+   IRTemp ge2 = newTemp(Ity_I32);
+   IRTemp ge3 = newTemp(Ity_I32);
+   assign(ge0, binop(Iop_And32, mkexpr(t32), mkU32(0x00000080)));
+   assign(ge1, binop(Iop_And32, mkexpr(t32), mkU32(0x00008000)));
+   assign(ge2, binop(Iop_And32, mkexpr(t32), mkU32(0x00800000)));
+   assign(ge3, binop(Iop_And32, mkexpr(t32), mkU32(0x80000000)));
+   put_GEFLAG32( 0, 0, mkexpr(ge0), condT );
+   put_GEFLAG32( 1, 0, mkexpr(ge1), condT );
+   put_GEFLAG32( 2, 0, mkexpr(ge2), condT );
+   put_GEFLAG32( 3, 0, mkexpr(ge3), condT );
+}
+
  
  /* ---------------- FPSCR stuff ---------------- */
  
@@ -989,14 +1088,13 @@ static HChar* nCC ( ARMCondcode cond ) {
  static IRExpr* mk_armg_calculate_condition_dyn ( IRExpr* cond )
  {
     vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I32);
-   /* And 'cond' had better produce a value in which only bits 7:4
-      bits are nonzero.  However, obviously we can't assert for
-      that. */
+   /* And 'cond' had better produce a value in which only bits 7:4 are
+      nonzero.  However, obviously we can't assert for that. */
  
     /* So what we're constructing for the first argument is 
-      "(cond << 4) | stored-operation-operation".  However,
-      as per comments above, must be supplied pre-shifted to this
-      function.
+      "(cond << 4) | stored-operation".
+      However, as per comments above, 'cond' must be supplied
+      pre-shifted to this function.
  
        This pairing scheme requires that the ARM_CC_OP_ values all fit
        in 4 bits.  Hence we are passing a (COND, OP) pair in the lowest
@@ -1391,6 +1489,200 @@ static void gen_SIGILL_T_if_in_ITBlock (
  }
  
  
+/* Generate an APSR value, from the NZCV thunk, and
+   from QFLAG32 and GEFLAG0 .. GEFLAG3. */
+static IRTemp synthesise_APSR ( void )
+{
+   IRTemp res1 = newTemp(Ity_I32);
+   // Get NZCV
+   assign( res1, mk_armg_calculate_flags_nzcv() );
+   // OR in the Q value
+   IRTemp res2 = newTemp(Ity_I32);
+   assign(
+      res2,
+      binop(Iop_Or32,
+            mkexpr(res1),
+            binop(Iop_Shl32,
+                  unop(Iop_1Uto32,
+                       binop(Iop_CmpNE32,
+                             mkexpr(get_QFLAG32()),
+                             mkU32(0))),
+                  mkU8(ARMG_CC_SHIFT_Q)))
+   );
+   // OR in GE0 .. GE3
+   IRExpr* ge0
+      = unop(Iop_1Uto32, binop(Iop_CmpNE32, get_GEFLAG32(0), mkU32(0)));
+   IRExpr* ge1
+      = unop(Iop_1Uto32, binop(Iop_CmpNE32, get_GEFLAG32(1), mkU32(0)));
+   IRExpr* ge2
+      = unop(Iop_1Uto32, binop(Iop_CmpNE32, get_GEFLAG32(2), mkU32(0)));
+   IRExpr* ge3
+      = unop(Iop_1Uto32, binop(Iop_CmpNE32, get_GEFLAG32(3), mkU32(0)));
+   IRTemp res3 = newTemp(Ity_I32);
+   assign(res3,
+          binop(Iop_Or32,
+                mkexpr(res2),
+                binop(Iop_Or32,
+                      binop(Iop_Or32,
+                            binop(Iop_Shl32, ge0, mkU8(16)),
+                            binop(Iop_Shl32, ge1, mkU8(17))),
+                      binop(Iop_Or32,
+                            binop(Iop_Shl32, ge2, mkU8(18)),
+                            binop(Iop_Shl32, ge3, mkU8(19))) )));
+   return res3;
+}
+
+
+/* and the inverse transformation: given an APSR value,
+   set the NZCV thunk, the Q flag, and the GE flags. */
+static void desynthesise_APSR ( Bool write_nzcvq, Bool write_ge,
+                                IRTemp apsrT, IRTemp condT )
+{
+   vassert(write_nzcvq || write_ge);
+   if (write_nzcvq) {
+      // Do NZCV
+      IRTemp immT = newTemp(Ity_I32);
+      assign(immT, binop(Iop_And32, mkexpr(apsrT), mkU32(0xF0000000)) );
+      setFlags_D1(ARMG_CC_OP_COPY, immT, condT);
+      // Do Q
+      IRTemp qnewT = newTemp(Ity_I32);
+      assign(qnewT, binop(Iop_And32, mkexpr(apsrT), mkU32(ARMG_CC_MASK_Q)));
+      put_QFLAG32(qnewT, condT);
+   }
+   if (write_ge) {
+      // Do GE3..0
+      put_GEFLAG32(0, 0, binop(Iop_And32, mkexpr(apsrT), mkU32(1<<16)),
+                   condT);
+      put_GEFLAG32(1, 0, binop(Iop_And32, mkexpr(apsrT), mkU32(1<<17)),
+                   condT);
+      put_GEFLAG32(2, 0, binop(Iop_And32, mkexpr(apsrT), mkU32(1<<18)),
+                   condT);
+      put_GEFLAG32(3, 0, binop(Iop_And32, mkexpr(apsrT), mkU32(1<<19)),
+                   condT);
+   }
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Helpers for saturation                               ---*/
+/*------------------------------------------------------------*/
+
+/* FIXME: the only differences between (a) armUnsignedSatQ and (b)
+   armSignedSatQ are the bounds -- (a) clamps to [0, 2^imm5 - 1], (b)
+   to [-2^(imm5-1), 2^(imm5-1) - 1] -- and the parameter order.  These
+   two functions should be commoned up. */
+
+/* UnsignedSatQ(): 'clamp' each value so it lies between 0 <= x <= (2^N)-1
+   Optionally return flag resQ saying whether saturation occurred.
+   See definition in manual, section A2.2.1, page 41
+   (bits(N), boolean) UnsignedSatQ( integer i, integer N )
+   {
+     if ( i > (2^N)-1 ) { result = (2^N)-1; saturated = TRUE; }
+     elsif ( i < 0 )    { result = 0; saturated = TRUE; }
+     else               { result = i; saturated = FALSE; }
+     return ( result<N-1:0>, saturated );
+   }
+*/
+static void armUnsignedSatQ( IRTemp* res,  /* OUT - Ity_I32 */
+                             IRTemp* resQ, /* OUT - Ity_I32  */
+                             IRTemp regT,  /* value to clamp - Ity_I32 */
+                             UInt imm5 )   /* saturation ceiling */
+{
+   UInt ceil  = (1U << imm5) - 1;   // (2^imm5)-1; 1U: no Int overflow at 31
+   UInt floor = 0;
+
+   IRTemp node0 = newTemp(Ity_I32);
+   IRTemp node1 = newTemp(Ity_I32);
+   IRTemp node2 = newTemp(Ity_I1);
+   IRTemp node3 = newTemp(Ity_I32);
+   IRTemp node4 = newTemp(Ity_I32);
+   IRTemp node5 = newTemp(Ity_I1);
+   IRTemp node6 = newTemp(Ity_I32);
+
+   assign( node0, mkexpr(regT) );
+   assign( node1, mkU32(ceil) );
+   assign( node2, binop( Iop_CmpLT32S, mkexpr(node1), mkexpr(node0) ) );
+   assign( node3, IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(node2)),
+                                mkexpr(node0),
+                                mkexpr(node1) ) );
+   assign( node4, mkU32(floor) );
+   assign( node5, binop( Iop_CmpLT32S, mkexpr(node3), mkexpr(node4) ) );
+   assign( node6, IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(node5)),
+                                mkexpr(node3),
+                                mkexpr(node4) ) );
+   assign( *res, mkexpr(node6) );
+
+   /* if saturation occurred, then resQ is set to some nonzero value
+      if sat did not occur, resQ is guaranteed to be zero. */
+   if (resQ) {
+      assign( *resQ, binop(Iop_Xor32, mkexpr(*res), mkexpr(regT)) );
+   }
+}
+
+
+/* SignedSatQ(): 'clamp' x so that  -2^(N-1) <= x <= 2^(N-1) - 1
+   Optionally return flag resQ saying whether saturation occurred.
+   - see definition in manual, section A2.2.1, page 41
+   (bits(N), boolean ) SignedSatQ( integer i, integer N ) 
+   {
+     if ( i > 2^(N-1) - 1 )    { result = 2^(N-1) - 1; saturated = TRUE; }
+     elsif ( i < -(2^(N-1)) )  { result = -(2^(N-1));  saturated = TRUE; }
+     else                      { result = i;           saturated = FALSE; }
+     return ( result[N-1:0], saturated );
+   }
+*/
+static void armSignedSatQ( IRTemp regT,    /* value to clamp - Ity_I32 */
+                           UInt imm5,      /* saturation ceiling */
+                           IRTemp* res,    /* OUT - Ity_I32 */
+                           IRTemp* resQ )  /* OUT - Ity_I32  */
+{
+   Int ceil  = (Int)((1U << (imm5-1)) - 1U); //  (2^(imm5-1))-1, no Int overflow
+   Int floor = -ceil - 1;                    // -(2^(imm5-1)), even if imm5 == 32
+
+   IRTemp node0 = newTemp(Ity_I32);
+   IRTemp node1 = newTemp(Ity_I32);
+   IRTemp node2 = newTemp(Ity_I1);
+   IRTemp node3 = newTemp(Ity_I32);
+   IRTemp node4 = newTemp(Ity_I32);
+   IRTemp node5 = newTemp(Ity_I1);
+   IRTemp node6 = newTemp(Ity_I32);
+
+   assign( node0, mkexpr(regT) );
+   assign( node1, mkU32(ceil) );
+   assign( node2, binop( Iop_CmpLT32S, mkexpr(node1), mkexpr(node0) ) );
+   assign( node3, IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(node2)),
+                                mkexpr(node0),  mkexpr(node1) ) );
+   assign( node4, mkU32(floor) );
+   assign( node5, binop( Iop_CmpLT32S, mkexpr(node3), mkexpr(node4) ) );
+   assign( node6, IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(node5)),
+                                mkexpr(node3),  mkexpr(node4) ) );
+   assign( *res, mkexpr(node6) );
+
+   /* if saturation occurred, then resQ is set to some nonzero value
+      if sat did not occur, resQ is guaranteed to be zero. */
+   if (resQ) {
+     assign( *resQ, binop(Iop_Xor32, mkexpr(*res), mkexpr(regT)) );
+   }
+}
+
+
+/* Compute a value 0 :: I32 or 1 :: I32, indicating whether signed
+   overflow occurred for 32-bit addition.  Needs both args and the
+   result.  HD p27. */
+static
+IRExpr* signed_overflow_after_Add32 ( IRExpr* resE,
+                                      IRTemp argL, IRTemp argR )
+{
+   IRTemp res = newTemp(Ity_I32);
+   assign(res, resE);
+   return
+      binop( Iop_Shr32, 
+             binop( Iop_And32,
+                    binop( Iop_Xor32, mkexpr(res), mkexpr(argL) ),
+                    binop( Iop_Xor32, mkexpr(res), mkexpr(argR) )), 
+             mkU8(31) );
+}
+
  
  /*------------------------------------------------------------*/
  /*--- Larger helpers                                       ---*/
@@ -1407,6 +1699,12 @@ static void gen_SIGILL_T_if_in_ITBlock (
  
     The calling convention for res and newC is a bit funny.  They could
     be passed by value, but instead are passed by ref.
+
+   The C (shco) value computed must be zero in bits 31:1, as the IR
+   optimisations for flag handling (guest_arm_spechelper) rely on
+   that, and the slow-path handlers (armg_calculate_flags_nzcv) assert
+   for it.  Same applies to all these functions that compute shco
+   after a shift or rotate, not just this one.
  */
  
  static void compute_result_and_C_after_LSL_by_imm5 (
@@ -1458,7 +1756,7 @@ static void compute_result_and_C_after_LSL_by_reg (
        /* mux0X(amt == 0,
                 mux0X(amt < 32, 
                       0,
-                     Rm[(32-amt) & 31])
+                     Rm[(32-amt) & 31]),
                 oldC)
        */
        /* About the best you can do is pray that iropt is able
@@ -1474,16 +1772,19 @@ static void compute_result_and_C_after_LSL_by_reg (
                 unop(Iop_1Uto8,
                      binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
                 mkU32(0),
-               binop(Iop_Shr32,
-                     mkexpr(rMt),
-                     unop(Iop_32to8,
-                          binop(Iop_And32,
-                                binop(Iop_Sub32,
-                                      mkU32(32),
-                                      mkexpr(amtT)),
-                                mkU32(31)
-                          )
-                     )
+               binop(Iop_And32,
+                     binop(Iop_Shr32,
+                           mkexpr(rMt),
+                           unop(Iop_32to8,
+                                binop(Iop_And32,
+                                      binop(Iop_Sub32,
+                                            mkU32(32),
+                                            mkexpr(amtT)),
+                                      mkU32(31)
+                                )
+                           )
+                     ),
+                     mkU32(1)
                 )
              ),
              mkexpr(oldC)
@@ -1569,7 +1870,7 @@ static void compute_result_and_C_after_LSR_by_reg (
        /* mux0X(amt == 0,
                 mux0X(amt < 32, 
                       0,
-                     Rm[(amt-1) & 31])
+                     Rm[(amt-1) & 31]),
                 oldC)
        */
        IRTemp oldC = newTemp(Ity_I32);
@@ -1583,16 +1884,19 @@ static void compute_result_and_C_after_LSR_by_reg (
                 unop(Iop_1Uto8,
                      binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
                 mkU32(0),
-               binop(Iop_Shr32,
-                     mkexpr(rMt),
-                     unop(Iop_32to8,
-                          binop(Iop_And32,
-                                binop(Iop_Sub32,
-                                      mkexpr(amtT),
-                                      mkU32(1)),
-                                mkU32(31)
-                          )
-                     )
+               binop(Iop_And32,
+                     binop(Iop_Shr32,
+                           mkexpr(rMt),
+                           unop(Iop_32to8,
+                                binop(Iop_And32,
+                                      binop(Iop_Sub32,
+                                            mkexpr(amtT),
+                                            mkU32(1)),
+                                      mkU32(31)
+                                )
+                           )
+                     ),
+                     mkU32(1)
                 )
              ),
              mkexpr(oldC)
@@ -1691,20 +1995,26 @@ static void compute_result_and_C_after_ASR_by_reg (
              IRExpr_Mux0X(
                 unop(Iop_1Uto8,
                      binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
-               binop(Iop_Shr32,
-                     mkexpr(rMt),
-                     mkU8(31)
+               binop(Iop_And32,
+                     binop(Iop_Shr32,
+                           mkexpr(rMt),
+                           mkU8(31)
+                     ),
+                     mkU32(1)
                 ),
-               binop(Iop_Shr32,
-                     mkexpr(rMt),
-                     unop(Iop_32to8,
-                          binop(Iop_And32,
-                                binop(Iop_Sub32,
-                                      mkexpr(amtT),
-                                      mkU32(1)),
-                                mkU32(31)
-                          )
-                     )
+               binop(Iop_And32,
+                     binop(Iop_Shr32,
+                           mkexpr(rMt),
+                           unop(Iop_32to8,
+                                binop(Iop_And32,
+                                      binop(Iop_Sub32,
+                                            mkexpr(amtT),
+                                            mkU32(1)),
+                                      mkU32(31)
+                                )
+                           )
+                     ),
+                     mkU32(1)
                 )
              ),
              mkexpr(oldC)
@@ -2373,6 +2683,108 @@ static Bool compute_ITSTATE ( /*OUT*/UInt*  itstate,
     return False;
  }
  
+
+/* Generate IR to do 32-bit bit reversal, a la Hacker's Delight
+   Chapter 7 Section 1. */
+static IRTemp gen_BITREV ( IRTemp x0 )
+{
+   IRTemp x1 = newTemp(Ity_I32);
+   IRTemp x2 = newTemp(Ity_I32);
+   IRTemp x3 = newTemp(Ity_I32);
+   IRTemp x4 = newTemp(Ity_I32);
+   IRTemp x5 = newTemp(Ity_I32);
+   UInt   c1 = 0x55555555;
+   UInt   c2 = 0x33333333;
+   UInt   c3 = 0x0F0F0F0F;
+   UInt   c4 = 0x00FF00FF;
+   UInt   c5 = 0x0000FFFF;
+   assign(x1,
+          binop(Iop_Or32,
+                binop(Iop_Shl32,
+                      binop(Iop_And32, mkexpr(x0), mkU32(c1)),
+                      mkU8(1)),
+                binop(Iop_Shr32,
+                      binop(Iop_And32, mkexpr(x0), mkU32(~c1)),
+                      mkU8(1))
+   ));
+   assign(x2,
+          binop(Iop_Or32,
+                binop(Iop_Shl32,
+                      binop(Iop_And32, mkexpr(x1), mkU32(c2)),
+                      mkU8(2)),
+                binop(Iop_Shr32,
+                      binop(Iop_And32, mkexpr(x1), mkU32(~c2)),
+                      mkU8(2))
+   ));
+   assign(x3,
+          binop(Iop_Or32,
+                binop(Iop_Shl32,
+                      binop(Iop_And32, mkexpr(x2), mkU32(c3)),
+                      mkU8(4)),
+                binop(Iop_Shr32,
+                      binop(Iop_And32, mkexpr(x2), mkU32(~c3)),
+                      mkU8(4))
+   ));
+   assign(x4,
+          binop(Iop_Or32,
+                binop(Iop_Shl32,
+                      binop(Iop_And32, mkexpr(x3), mkU32(c4)),
+                      mkU8(8)),
+                binop(Iop_Shr32,
+                      binop(Iop_And32, mkexpr(x3), mkU32(~c4)),
+                      mkU8(8))
+   ));
+   assign(x5,
+          binop(Iop_Or32,
+                binop(Iop_Shl32,
+                      binop(Iop_And32, mkexpr(x4), mkU32(c5)),
+                      mkU8(16)),
+                binop(Iop_Shr32,
+                      binop(Iop_And32, mkexpr(x4), mkU32(~c5)),
+                      mkU8(16))
+   ));
+   return x5;
+}
+
+
+/* Generate IR to rearrange bytes 3:2:1:0 of a word into the order
+   0:1:2:3 (aka byte-swap). */
+static IRTemp gen_REV ( IRTemp arg )
+{
+   IRTemp res = newTemp(Ity_I32);
+   assign(res, 
+          binop(Iop_Or32,
+                binop(Iop_Shl32, mkexpr(arg), mkU8(24)),
+          binop(Iop_Or32,
+                binop(Iop_And32, binop(Iop_Shl32, mkexpr(arg), mkU8(8)), 
+                                 mkU32(0x00FF0000)),
+          binop(Iop_Or32,
+                binop(Iop_And32, binop(Iop_Shr32, mkexpr(arg), mkU8(8)),
+                                       mkU32(0x0000FF00)),
+                binop(Iop_And32, binop(Iop_Shr32, mkexpr(arg), mkU8(24)),
+                                       mkU32(0x000000FF) )
+   ))));
+   return res;
+}
+
+
+/* Generate IR to rearrange bytes 3:2:1:0 of a word into the order
+   2:3:0:1 (swap within lo and hi halves). */
+static IRTemp gen_REV16 ( IRTemp arg )
+{
+   IRTemp res = newTemp(Ity_I32);
+   assign(res,
+          binop(Iop_Or32,
+                binop(Iop_And32,
+                      binop(Iop_Shl32, mkexpr(arg), mkU8(8)),
+                      mkU32(0xFF00FF00)),
+                binop(Iop_And32,
+                      binop(Iop_Shr32, mkexpr(arg), mkU8(8)),
+                      mkU32(0x00FF00FF))));
+   return res;
+}
+
+
  /*------------------------------------------------------------*/
  /*--- Advanced SIMD (NEON) instructions                    ---*/
  /*------------------------------------------------------------*/
@@ -3399,7 +3811,7 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
        case 5:
           if (B == 0) {
              /* VRSHL */
-            IROp op, op_shrn, op_shln, cmp_gt, op_sub, op_add;
+            IROp op, op_shrn, op_shln, cmp_gt, op_add;
              IRTemp shval, old_shval, imm_val, round;
              UInt i;
              ULong imm;
@@ -3419,28 +3831,24 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                 switch (size) {
                    case 0:
                       op = Q ? Iop_Shl8x16 : Iop_Shl8x8;
-                     op_sub = Q ? Iop_Sub8x16 : Iop_Sub8x8;
                       op_add = Q ? Iop_Add8x16 : Iop_Add8x8;
                       op_shrn = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
                       op_shln = Q ? Iop_ShlN8x16 : Iop_ShlN8x8;
                       break;
                    case 1:
                       op = Q ? Iop_Shl16x8 : Iop_Shl16x4;
-                     op_sub = Q ? Iop_Sub16x8 : Iop_Sub16x4;
                       op_add = Q ? Iop_Add16x8 : Iop_Add16x4;
                       op_shrn = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
                       op_shln = Q ? Iop_ShlN16x8 : Iop_ShlN16x4;
                       break;
                    case 2:
                       op = Q ? Iop_Shl32x4 : Iop_Shl32x2;
-                     op_sub = Q ? Iop_Sub32x4 : Iop_Sub32x2;
                       op_add = Q ? Iop_Add32x4 : Iop_Add32x2;
                       op_shrn = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
                       op_shln = Q ? Iop_ShlN32x4 : Iop_ShlN32x2;
                       break;
                    case 3:
                       op = Q ? Iop_Shl64x2 : Iop_Shl64;
-                     op_sub = Q ? Iop_Sub64x2 : Iop_Sub64;
                       op_add = Q ? Iop_Add64x2 : Iop_Add64;
                       op_shrn = Q ? Iop_ShrN64x2 : Iop_Shr64;
                       op_shln = Q ? Iop_ShlN64x2 : Iop_Shl64;
@@ -3452,28 +3860,24 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                 switch (size) {
                    case 0:
                       op = Q ? Iop_Sal8x16 : Iop_Sal8x8;
-                     op_sub = Q ? Iop_Sub8x16 : Iop_Sub8x8;
                       op_add = Q ? Iop_Add8x16 : Iop_Add8x8;
                       op_shrn = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
                       op_shln = Q ? Iop_ShlN8x16 : Iop_ShlN8x8;
                       break;
                    case 1:
                       op = Q ? Iop_Sal16x8 : Iop_Sal16x4;
-                     op_sub = Q ? Iop_Sub16x8 : Iop_Sub16x4;
                       op_add = Q ? Iop_Add16x8 : Iop_Add16x4;
                       op_shrn = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
                       op_shln = Q ? Iop_ShlN16x8 : Iop_ShlN16x4;
                       break;
                    case 2:
                       op = Q ? Iop_Sal32x4 : Iop_Sal32x2;
-                     op_sub = Q ? Iop_Sub32x4 : Iop_Sub32x2;
                       op_add = Q ? Iop_Add32x4 : Iop_Add32x2;
                       op_shrn = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
                       op_shln = Q ? Iop_ShlN32x4 : Iop_ShlN32x2;
                       break;
                    case 3:
                       op = Q ? Iop_Sal64x2 : Iop_Sal64x1;
-                     op_sub = Q ? Iop_Sub64x2 : Iop_Sub64;
                       op_add = Q ? Iop_Add64x2 : Iop_Add64;
                       op_shrn = Q ? Iop_ShrN64x2 : Iop_Shr64;
                       op_shln = Q ? Iop_ShlN64x2 : Iop_Shl64;
@@ -3509,7 +3913,7 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                                     binop(op,
                                           mkexpr(arg_m),
                                           unop(Iop_64to8,
-                                              binop(op_sub,
+                                              binop(op_add,
                                                      mkexpr(arg_n),
                                                      mkexpr(imm_val)))),
                                     binop(Q ? Iop_AndV128 : Iop_And64,
@@ -3544,7 +3948,7 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                  nreg);
           } else {
              /* VQRSHL */
-            IROp op, op_rev, op_shrn, op_shln, cmp_neq, cmp_gt, op_sub, op_add;
+            IROp op, op_rev, op_shrn, op_shln, cmp_neq, cmp_gt, op_add;
              IRTemp tmp, shval, mask, old_shval, imm_val, round;
              UInt i;
              ULong esize, imm;
@@ -3565,7 +3969,6 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                 switch (size) {
                    case 0:
                       op = Q ? Iop_QShl8x16 : Iop_QShl8x8;
-                     op_sub = Q ? Iop_Sub8x16 : Iop_Sub8x8;
                       op_add = Q ? Iop_Add8x16 : Iop_Add8x8;
                       op_rev = Q ? Iop_Shr8x16 : Iop_Shr8x8;
                       op_shrn = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
@@ -3573,7 +3976,6 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                       break;
                    case 1:
                       op = Q ? Iop_QShl16x8 : Iop_QShl16x4;
-                     op_sub = Q ? Iop_Sub16x8 : Iop_Sub16x4;
                       op_add = Q ? Iop_Add16x8 : Iop_Add16x4;
                       op_rev = Q ? Iop_Shr16x8 : Iop_Shr16x4;
                       op_shrn = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
@@ -3581,7 +3983,6 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                       break;
                    case 2:
                       op = Q ? Iop_QShl32x4 : Iop_QShl32x2;
-                     op_sub = Q ? Iop_Sub32x4 : Iop_Sub32x2;
                       op_add = Q ? Iop_Add32x4 : Iop_Add32x2;
                       op_rev = Q ? Iop_Shr32x4 : Iop_Shr32x2;
                       op_shrn = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
@@ -3589,7 +3990,6 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                       break;
                    case 3:
                       op = Q ? Iop_QShl64x2 : Iop_QShl64x1;
-                     op_sub = Q ? Iop_Sub64x2 : Iop_Sub64;
                       op_add = Q ? Iop_Add64x2 : Iop_Add64;
                       op_rev = Q ? Iop_Shr64x2 : Iop_Shr64;
                       op_shrn = Q ? Iop_ShrN64x2 : Iop_Shr64;
@@ -3602,7 +4002,6 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                 switch (size) {
                    case 0:
                       op = Q ? Iop_QSal8x16 : Iop_QSal8x8;
-                     op_sub = Q ? Iop_Sub8x16 : Iop_Sub8x8;
                       op_add = Q ? Iop_Add8x16 : Iop_Add8x8;
                       op_rev = Q ? Iop_Sar8x16 : Iop_Sar8x8;
                       op_shrn = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
@@ -3610,7 +4009,6 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                       break;
                    case 1:
                       op = Q ? Iop_QSal16x8 : Iop_QSal16x4;
-                     op_sub = Q ? Iop_Sub16x8 : Iop_Sub16x4;
                       op_add = Q ? Iop_Add16x8 : Iop_Add16x4;
                       op_rev = Q ? Iop_Sar16x8 : Iop_Sar16x4;
                       op_shrn = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
@@ -3618,7 +4016,6 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                       break;
                    case 2:
                       op = Q ? Iop_QSal32x4 : Iop_QSal32x2;
-                     op_sub = Q ? Iop_Sub32x4 : Iop_Sub32x2;
                       op_add = Q ? Iop_Add32x4 : Iop_Add32x2;
                       op_rev = Q ? Iop_Sar32x4 : Iop_Sar32x2;
                       op_shrn = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
@@ -3626,7 +4023,6 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                       break;
                    case 3:
                       op = Q ? Iop_QSal64x2 : Iop_QSal64x1;
-                     op_sub = Q ? Iop_Sub64x2 : Iop_Sub64;
                       op_add = Q ? Iop_Add64x2 : Iop_Add64;
                       op_rev = Q ? Iop_Sar64x2 : Iop_Sar64;
                       op_shrn = Q ? Iop_ShrN64x2 : Iop_Shr64;
@@ -3751,9 +4147,9 @@ Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
                 }
              } else {
                 switch (size) {
-                  case 0: op = Q ? Iop_Min8Sx16 : Iop_Min8Sx8; break;
-                  case 1: op = Q ? Iop_Min16Sx8 : Iop_Min16Sx4; break;
-                  case 2: op = Q ? Iop_Min32Sx4 : Iop_Min32Sx2; break;
+                  case 0: op = Q ? Iop_Min8Ux16 : Iop_Min8Ux8; break;
+                  case 1: op = Q ? Iop_Min16Ux8 : Iop_Min16Ux4; break;
+                  case 2: op = Q ? Iop_Min32Ux4 : Iop_Min32Ux2; break;
                    case 3: return False;
                    default: vassert(0);
                 }
@@ -5202,11 +5598,10 @@ Bool dis_neon_data_2reg_and_scalar ( UInt theInstr, IRTemp condT )
     if (INSN(11,8) == BITS4(1,0,1,1) && !U) {
        IROp op ,op2, dup, get;
        ULong imm;
-      IRTemp res, arg_m, arg_n;
+      IRTemp arg_m, arg_n;
        if (dreg & 1)
           return False;
        dreg >>= 1;
-      res = newTemp(Ity_V128);
        arg_m = newTemp(Ity_I64);
        arg_n = newTemp(Ity_I64);
        assign(arg_n, getDRegI64(nreg));
@@ -6817,8 +7212,8 @@ Bool dis_neon_data_2reg_misc ( UInt theInstr, IRTemp condT )
                       op_hi = Iop_InterleaveEvenLanes16x4;
                       break;
                    case 2:
-                     op_lo = Iop_InterleaveHI32x2;
-                     op_hi = Iop_InterleaveLO32x2;
+                     op_lo = Iop_InterleaveLO32x2;
+                     op_hi = Iop_InterleaveHI32x2;
                       break;
                    case 3:
                       return False;
@@ -6912,7 +7307,7 @@ Bool dis_neon_data_2reg_misc ( UInt theInstr, IRTemp condT )
              }
              switch (size) {
                 case 0:
-                  op_lo = Q ? Iop_InterleaveLO8x16 : Iop_InterleaveHI8x8;
+                  op_lo = Q ? Iop_InterleaveHI8x16 : Iop_InterleaveHI8x8;
                    op_hi = Q ? Iop_InterleaveLO8x16 : Iop_InterleaveLO8x8;
                    break;
                 case 1:
@@ -7529,7 +7924,6 @@ Bool dis_neon_elem_or_struct_load ( UInt theInstr,
     UInt N, size, i, j;
     UInt inc;
     UInt regs = 1;
-   IRTemp addr;
  
     if (isT) {
        vassert(condT != IRTemp_INVALID);
@@ -7542,6 +7936,12 @@ Bool dis_neon_elem_or_struct_load ( UInt theInstr,
     if (INSN(20,20) != 0)
        return False;
  
+   IRTemp initialRn = newTemp(Ity_I32);
+   assign(initialRn, isT ? getIRegT(rN) : getIRegA(rN));
+
+   IRTemp initialRm = newTemp(Ity_I32);
+   assign(initialRm, isT ? getIRegT(rM) : getIRegA(rM));
+
     if (A) {
        N = B & 3;
        if ((B >> 2) < 3) {
@@ -7557,8 +7957,8 @@ Bool dis_neon_elem_or_struct_load ( UInt theInstr,
              default: vassert(0);
           }
  
-         addr = newTemp(Ity_I32);
-         assign(addr, isT ? getIRegT(rN) : getIRegA(rN));
+         IRTemp addr = newTemp(Ity_I32);
+         assign(addr, mkexpr(initialRn));
  
           // go uncond
           if (condT != IRTemp_INVALID)
@@ -7575,7 +7975,12 @@ Bool dis_neon_elem_or_struct_load ( UInt theInstr,
                 DIP(", ");
              DIP("d%u[%u]", rD + j * inc, i);
           }
-            DIP("}, [r%u]%s\n", rN, (rM != 15) ? "!" : "");
+         DIP("}, [r%u]", rN);
+         if (rM != 13 && rM != 15) {
+            DIP(", r%u\n", rM);
+         } else {
+            DIP("%s\n", (rM != 15) ? "!" : "");
+         }
        } else {
           /* VLDn (single element to all lanes) */
           UInt r;
@@ -7601,8 +8006,8 @@ Bool dis_neon_elem_or_struct_load ( UInt theInstr,
              mk_skip_over_T32_if_cond_is_false(condT);
           // now uncond
  
-         addr = newTemp(Ity_I32);
-         assign(addr, isT ? getIRegT(rN) : getIRegA(rN));
+         IRTemp addr = newTemp(Ity_I32);
+         assign(addr, mkexpr(initialRn));
  
           if (N == 0 && INSN(5,5))
              regs = 2;
@@ -7666,21 +8071,27 @@ Bool dis_neon_elem_or_struct_load ( UInt theInstr,
                 DIP("d%u[]", rD + r + i * inc);
              }
           }
-         DIP("}, [r%u]%s\n", rN, (rM != 15) ? "!" : "");
+         DIP("}, [r%u]", rN);
+         if (rM != 13 && rM != 15) {
+            DIP(", r%u\n", rM);
+         } else {
+            DIP("%s\n", (rM != 15) ? "!" : "");
+         }
        }
        /* Writeback.  We're uncond here, so no condT-ing. */
        if (rM != 15) {
           if (rM == 13) {
              IRExpr* e = binop(Iop_Add32,
-                              mkexpr(addr),
+                              mkexpr(initialRn),
                                mkU32((1 << size) * (N + 1)));
              if (isT)
                 putIRegT(rN, e, IRTemp_INVALID);
              else
                 putIRegA(rN, e, IRTemp_INVALID, Ijk_Boring);
           } else {
-            IRExpr* e = binop(Iop_Add32, mkexpr(addr),
-                              isT ? getIRegT(rM) : getIRegA(rM));
+            IRExpr* e = binop(Iop_Add32,
+                              mkexpr(initialRn),
+                              mkexpr(initialRm));
              if (isT)
                 putIRegT(rN, e, IRTemp_INVALID);
              else
@@ -7731,8 +8142,8 @@ Bool dis_neon_elem_or_struct_load ( UInt theInstr,
           mk_skip_over_T32_if_cond_is_false(condT);
        // now uncond
  
-      addr = newTemp(Ity_I32);
-      assign(addr, isT ? getIRegT(rN) : getIRegA(rN));
+      IRTemp addr = newTemp(Ity_I32);
+      assign(addr, mkexpr(initialRn));
  
        for (r = 0; r < regs; r++) {
           for (i = 0; i < elems; i++) {
@@ -7750,15 +8161,16 @@ Bool dis_neon_elem_or_struct_load ( UInt theInstr,
        if (rM != 15) {
           if (rM == 13) {
              IRExpr* e = binop(Iop_Add32,
-                              mkexpr(addr),
+                              mkexpr(initialRn),
                                mkU32(8 * (N + 1) * regs));
              if (isT)
                 putIRegT(rN, e, IRTemp_INVALID);
              else
                 putIRegA(rN, e, IRTemp_INVALID, Ijk_Boring);
           } else {
-            IRExpr* e = binop(Iop_Add32, mkexpr(addr),
-                              isT ? getIRegT(rM) : getIRegA(rM));
+            IRExpr* e = binop(Iop_Add32,
+                              mkexpr(initialRn),
+                              mkexpr(initialRm));
              if (isT)
                 putIRegT(rN, e, IRTemp_INVALID);
              else
@@ -7778,7 +8190,12 @@ Bool dis_neon_elem_or_struct_load ( UInt theInstr,
              }
           }
        }
-      DIP("}, [r%u]%s\n", rN, (rM != 15) ? "!" : "");
+      DIP("}, [r%u]", rN);
+      if (rM != 13 && rM != 15) {
+         DIP(", r%u\n", rM);
+      } else {
+         DIP("%s\n", (rM != 15) ? "!" : "");
+      }
        return True;
     }
  #  undef INSN
@@ -7868,100 +8285,2008 @@ static Bool decode_NEON_instruction (
  
  
  /*------------------------------------------------------------*/
-/*--- LDMxx/STMxx helper (both ARM and Thumb32)            ---*/
+/*--- V6 MEDIA instructions                                ---*/
  /*------------------------------------------------------------*/
  
-/* Generate IR for LDMxx and STMxx.  This is complex.  Assumes it's
-   unconditional, so the caller must produce a jump-around before
-   calling this, if the insn is to be conditional.  Caller is
-   responsible for all validation of parameters.  For LDMxx, if PC is
-   amongst the values loaded, caller is also responsible for
-   generating the jump. */
-static void mk_ldm_stm ( Bool arm,     /* True: ARM, False: Thumb */
-                         UInt rN,      /* base reg */
-                         UInt bINC,    /* 1: inc,  0: dec */
-                         UInt bBEFORE, /* 1: inc/dec before, 0: after */
-                         UInt bW,      /* 1: writeback to Rn */
-                         UInt bL,      /* 1: load, 0: store */
-                         UInt regList )
-{
-   Int i, r, m, nRegs;
+/* Both ARM and Thumb */
  
-   /* Get hold of the old Rn value.  We might need to write its value
-      to memory during a store, and if it's also the writeback
-      register then we need to get its value now.  We can't treat it
-      exactly like the other registers we're going to transfer,
-      because for xxMDA and xxMDB writeback forms, the generated IR
-      updates Rn in the guest state before any transfers take place.
-      We have to do this as per comments below, in order that if Rn is
-      the stack pointer then it always has a value is below or equal
-      to any of the transfer addresses.  Ick. */
-   IRTemp oldRnT = newTemp(Ity_I32);
-   assign(oldRnT, arm ? getIRegA(rN) : getIRegT(rN));
+/* Translate a V6 media instruction.  If successful, returns
+   True and *dres may or may not be updated.  On failure, returns
+   False, leaves *dres unchanged and creates no IR.
  
-   IRTemp anchorT = newTemp(Ity_I32);
-   /* The old (Addison-Wesley) ARM ARM seems to say that LDMxx/STMxx
-      ignore the bottom two bits of the address.  However, Cortex-A8
-      doesn't seem to care.  Hence: */
-   /* No .. don't force alignment .. */
-   /* assign(anchorT, binop(Iop_And32, mkexpr(oldRnT), mkU32(~3U))); */
-   /* Instead, use the potentially misaligned address directly. */
-   assign(anchorT, mkexpr(oldRnT));
+   The Thumb and ARM encodings are completely different.  In Thumb
+   mode, the caller must pass the entire 32 bits.  In ARM mode it must
+   pass the lower 28 bits.  Apart from that, callers may pass any
+   instruction; this function ignores anything it doesn't recognise.
  
-   IROp opADDorSUB = bINC ? Iop_Add32 : Iop_Sub32;
-   // bINC == 1:  xxMIA, xxMIB
-   // bINC == 0:  xxMDA, xxMDB
+   Caller must supply an IRTemp 'condT' holding the gating condition,
+   or IRTemp_INVALID indicating the insn is always executed.
  
-   // For xxMDA and xxMDB, update Rn first if necessary.  We have
-   // to do this first so that, for the common idiom of the transfers
-   // faulting because we're pushing stuff onto a stack and the stack
-   // is growing down onto allocate-on-fault pages (as Valgrind simulates),
-   // we need to have the SP up-to-date "covering" (pointing below) the
-   // transfer area.  For the same reason, if we are doing xxMIA or xxMIB,
-   // do the transfer first, and then update rN afterwards.
-   nRegs = 0;
-   for (i = 0; i < 16; i++) {
-     if ((regList & (1 << i)) != 0)
-         nRegs++;
-   }
-   if (bW == 1 && !bINC) {
-      IRExpr* e = binop(opADDorSUB, mkexpr(oldRnT), mkU32(4*nRegs));
-      if (arm)
-         putIRegA( rN, e, IRTemp_INVALID, Ijk_Boring );
-      else
-         putIRegT( rN, e, IRTemp_INVALID );
+   Caller must also supply an ARMCondcode 'conq'.  This is only used
+   for debug printing, no other purpose.  For ARM, this is simply the
+   top 4 bits of the original instruction.  For Thumb, the condition
+   is not (really) known until run time, and so ARMCondAL should be
+   passed, only so that printing of these instructions does not show
+   any condition.
+
+   Finally, the caller must indicate whether this occurs in ARM or in
+   Thumb code.
+*/
+static Bool decode_V6MEDIA_instruction (
+               /*MOD*/DisResult* dres,
+               UInt              insnv6m,
+               IRTemp            condT,
+               ARMCondcode       conq,
+               Bool              isT
+            )
+{
+#  define INSNA(_bMax,_bMin)   SLICE_UInt(insnv6m, (_bMax), (_bMin))
+#  define INSNT0(_bMax,_bMin)  SLICE_UInt( ((insnv6m >> 16) & 0xFFFF), \
+                                           (_bMax), (_bMin) )
+#  define INSNT1(_bMax,_bMin)  SLICE_UInt( ((insnv6m >> 0)  & 0xFFFF), \
+                                           (_bMax), (_bMin) )
+   HChar dis_buf[128];
+   dis_buf[0] = 0;
+
+   if (isT) {
+      vassert(conq == ARMCondAL);
+   } else {
+      vassert(INSNA(31,28) == BITS4(0,0,0,0)); // caller's obligation
+      vassert(conq >= ARMCondEQ && conq <= ARMCondAL);
     }
  
-   // Make up a list of the registers to transfer, and their offsets
-   // in memory relative to the anchor.  If the base reg (Rn) is part
-   // of the transfer, then do it last for a load and first for a store.
-   UInt xReg[16], xOff[16];
-   Int  nX = 0;
-   m = 0;
-   for (i = 0; i < 16; i++) {
-      r = bINC ? i : (15-i);
-      if (0 == (regList & (1<<r)))
-         continue;
-      if (bBEFORE)
-         m++;
-      /* paranoia: check we aren't transferring the writeback
-         register during a load. Should be assured by decode-point
-         check above. */
-      if (bW == 1 && bL == 1)
-         vassert(r != rN);
+   /* ----------- smulbb, smulbt, smultb, smultt ----------- */
+   {
+     UInt regD = 99, regM = 99, regN = 99, bitM = 0, bitN = 0;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,4) == 0xFB1 && INSNT1(15,12) == BITS4(1,1,1,1)
+            && INSNT1(7,6) == BITS2(0,0)) {
+           regD = INSNT1(11,8);
+           regM = INSNT1(3,0);
+           regN = INSNT0(3,0);
+           bitM = INSNT1(4,4);
+           bitN = INSNT1(5,5);
+           if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+              gate = True;
+        }
+     } else {
+        if (BITS8(0,0,0,1,0,1,1,0) == INSNA(27,20) &&
+            BITS4(0,0,0,0)         == INSNA(15,12) &&
+            BITS4(1,0,0,0)         == (INSNA(7,4) & BITS4(1,0,0,1)) ) {
+           regD = INSNA(19,16);
+           regM = INSNA(11,8);
+           regN = INSNA(3,0);
+           bitM = INSNA(6,6);
+           bitN = INSNA(5,5);
+           if (regD != 15 && regN != 15 && regM != 15)
+              gate = True;
+        }
+     }
  
-      xOff[nX] = 4 * m;
-      xReg[nX] = r;
-      nX++;
+     if (gate) {
+        IRTemp srcN = newTemp(Ity_I32);
+        IRTemp srcM = newTemp(Ity_I32);
+        IRTemp res  = newTemp(Ity_I32);
+
+        assign( srcN, binop(Iop_Sar32,
+                            binop(Iop_Shl32,
+                                  isT ? getIRegT(regN) : getIRegA(regN),
+                                  mkU8(bitN ? 0 : 16)), mkU8(16)) );
+        assign( srcM, binop(Iop_Sar32,
+                            binop(Iop_Shl32,
+                                  isT ? getIRegT(regM) : getIRegA(regM),
+                                  mkU8(bitM ? 0 : 16)), mkU8(16)) );
+        assign( res, binop(Iop_Mul32, mkexpr(srcN), mkexpr(srcM)) );
+
+        if (isT)
+           putIRegT( regD, mkexpr(res), condT );
+        else
+           putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+        DIP( "smul%c%c%s r%u, r%u, r%u\n", bitN ? 't' : 'b', bitM ? 't' : 'b',
+             nCC(conq), regD, regN, regM );
+        return True;
+     }
+     /* fall through */
+   }
  
-      if (!bBEFORE)
-         m++;
+   /* -------------- smulwb<c> <Rd>,<Rn>,<Rm> -------------- */
+   /* -------------- smulwt<c> <Rd>,<Rn>,<Rm> -------------- */
+   {
+     UInt regD = 99, regN = 99, regM = 99, bitM = 0;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,4) == 0xFB3 && INSNT1(15,12) == BITS4(1,1,1,1)
+            && INSNT1(7,5) == BITS3(0,0,0)) {
+          regN = INSNT0(3,0);
+          regD = INSNT1(11,8);
+          regM = INSNT1(3,0);
+          bitM = INSNT1(4,4);
+          if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+             gate = True;
+        }
+     } else {
+        if (INSNA(27,20) == BITS8(0,0,0,1,0,0,1,0) && 
+            INSNA(15,12) == BITS4(0,0,0,0)         &&
+            (INSNA(7,4) & BITS4(1,0,1,1)) == BITS4(1,0,1,0)) {
+           regD = INSNA(19,16);
+           regN = INSNA(3,0);
+           regM = INSNA(11,8);
+           bitM = INSNA(6,6);
+           if (regD != 15 && regN != 15 && regM != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp irt_prod = newTemp(Ity_I64);
+
+        assign( irt_prod, 
+                binop(Iop_MullS32,
+                      isT ? getIRegT(regN) : getIRegA(regN),
+                      binop(Iop_Sar32, 
+                            binop(Iop_Shl32,
+                                  isT ? getIRegT(regM) : getIRegA(regM),
+                                  mkU8(bitM ? 0 : 16)), 
+                            mkU8(16))) );
+
+        IRExpr* ire_result = binop(Iop_Or32, 
+                                   binop( Iop_Shl32, 
+                                          unop(Iop_64HIto32, mkexpr(irt_prod)), 
+                                          mkU8(16) ), 
+                                   binop( Iop_Shr32, 
+                                          unop(Iop_64to32, mkexpr(irt_prod)), 
+                                          mkU8(16) ) );
+
+        if (isT)
+           putIRegT( regD, ire_result, condT );
+        else
+           putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+        DIP("smulw%c%s r%u, r%u, r%u\n",
+            bitM ? 't' : 'b', nCC(conq),regD,regN,regM);
+        return True;
+     }
+     /* fall through */
     }
-   vassert(m == nRegs);
-   vassert(nX == nRegs);
-   vassert(nX <= 16);
  
-   if (bW == 0 && (regList & (1<<rN)) != 0) {
+   /* ------------ pkhbt<c> Rd, Rn, Rm {,LSL #imm} ------------- */
+   /* ------------ pkhtb<c> Rd, Rn, Rm {,ASR #imm} ------------- */
+   {
+     UInt regD = 99, regN = 99, regM = 99, imm5 = 99, shift_type = 99;
+     Bool tbform = False;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,4) == 0xEAC 
+            && INSNT1(15,15) == 0 && INSNT1(4,4) == 0) {
+           regN = INSNT0(3,0);
+           regD = INSNT1(11,8);
+           regM = INSNT1(3,0);
+           imm5 = (INSNT1(14,12) << 2) | INSNT1(7,6);
+           shift_type = (INSNT1(5,5) << 1) | 0;
+           tbform = (INSNT1(5,5) == 0) ? False : True;
+           if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+              gate = True;
+        }
+     } else {
+        if (INSNA(27,20) == BITS8(0,1,1,0,1,0,0,0) &&
+            INSNA(5,4)   == BITS2(0,1)             &&
+            (INSNA(6,6)  == 0 || INSNA(6,6) == 1) ) {
+           regD = INSNA(15,12);
+           regN = INSNA(19,16);
+           regM = INSNA(3,0);
+           imm5 = INSNA(11,7);
+           shift_type = (INSNA(6,6) << 1) | 0;
+           tbform = (INSNA(6,6) == 0) ? False : True;
+           if (regD != 15 && regN != 15 && regM != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp irt_regM       = newTemp(Ity_I32);
+        IRTemp irt_regM_shift = newTemp(Ity_I32);
+        assign( irt_regM, isT ? getIRegT(regM) : getIRegA(regM) );
+        compute_result_and_C_after_shift_by_imm5(
+           dis_buf, &irt_regM_shift, NULL, irt_regM, shift_type, imm5, regM );
+
+        UInt mask = (tbform == True) ? 0x0000FFFF : 0xFFFF0000;
+        IRExpr* ire_result 
+          = binop( Iop_Or32, 
+                   binop(Iop_And32, mkexpr(irt_regM_shift), mkU32(mask)), 
+                   binop(Iop_And32, isT ? getIRegT(regN) : getIRegA(regN),
+                                    unop(Iop_Not32, mkU32(mask))) );
+
+        if (isT)
+           putIRegT( regD, ire_result, condT );
+        else
+           putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+        DIP( "pkh%s%s r%u, r%u, r%u %s\n", tbform ? "tb" : "bt", 
+             nCC(conq), regD, regN, regM, dis_buf );
+
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ---------- usat<c> <Rd>,#<imm5>,<Rn>{,<shift>} ----------- */
+   {
+     UInt regD = 99, regN = 99, shift_type = 99, imm5 = 99, sat_imm = 99;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,6) == BITS10(1,1,1,1,0,0,1,1,1,0)
+            && INSNT0(4,4) == 0
+            && INSNT1(15,15) == 0 && INSNT1(5,5) == 0) {
+           regD       = INSNT1(11,8);
+           regN       = INSNT0(3,0);
+           shift_type = (INSNT0(5,5) << 1) | 0;
+           imm5       = (INSNT1(14,12) << 2) | INSNT1(7,6);
+           sat_imm    = INSNT1(4,0);
+           if (!isBadRegT(regD) && !isBadRegT(regN))
+              gate = True;
+           if (shift_type == BITS2(1,0) && imm5 == 0)
+              gate = False;
+        }
+     } else {
+        if (INSNA(27,21) == BITS7(0,1,1,0,1,1,1) &&
+            INSNA(5,4)   == BITS2(0,1)) {
+           regD       = INSNA(15,12);
+           regN       = INSNA(3,0);
+           shift_type = (INSNA(6,6) << 1) | 0;
+           imm5       = INSNA(11,7);
+           sat_imm    = INSNA(20,16);
+           if (regD != 15 && regN != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp irt_regN       = newTemp(Ity_I32);
+        IRTemp irt_regN_shift = newTemp(Ity_I32);
+        IRTemp irt_sat_Q      = newTemp(Ity_I32);
+        IRTemp irt_result     = newTemp(Ity_I32);
+
+        assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+        compute_result_and_C_after_shift_by_imm5(
+                dis_buf, &irt_regN_shift, NULL,
+                irt_regN, shift_type, imm5, regN );
+
+        armUnsignedSatQ( &irt_result, &irt_sat_Q, irt_regN_shift, sat_imm );
+        or_into_QFLAG32( mkexpr(irt_sat_Q), condT );
+
+        if (isT)
+           putIRegT( regD, mkexpr(irt_result), condT );
+        else
+           putIRegA( regD, mkexpr(irt_result), condT, Ijk_Boring );
+
+        DIP("usat%s r%u, #0x%04x, %s\n",
+            nCC(conq), regD, imm5, dis_buf);
+        return True;
+     }
+     /* fall through */
+   }
+
+  /* ----------- ssat<c> <Rd>,#<imm5>,<Rn>{,<shift>} ----------- */
+   {
+     UInt regD = 99, regN = 99, shift_type = 99, imm5 = 99, sat_imm = 99;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,6) == BITS10(1,1,1,1,0,0,1,1,0,0)
+            && INSNT0(4,4) == 0
+            && INSNT1(15,15) == 0 && INSNT1(5,5) == 0) {
+           regD       = INSNT1(11,8);
+           regN       = INSNT0(3,0);
+           shift_type = (INSNT0(5,5) << 1) | 0;
+           imm5       = (INSNT1(14,12) << 2) | INSNT1(7,6);
+           sat_imm    = INSNT1(4,0) + 1;
+           if (!isBadRegT(regD) && !isBadRegT(regN))
+              gate = True;
+           if (shift_type == BITS2(1,0) && imm5 == 0)
+              gate = False;
+        }
+     } else {
+        if (INSNA(27,21) == BITS7(0,1,1,0,1,0,1) &&
+            INSNA(5,4)   == BITS2(0,1)) {
+           regD       = INSNA(15,12);
+           regN       = INSNA(3,0);
+           shift_type = (INSNA(6,6) << 1) | 0;
+           imm5       = INSNA(11,7);
+           sat_imm    = INSNA(20,16) + 1;
+           if (regD != 15 && regN != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp irt_regN       = newTemp(Ity_I32);
+        IRTemp irt_regN_shift = newTemp(Ity_I32);
+        IRTemp irt_sat_Q      = newTemp(Ity_I32);
+        IRTemp irt_result     = newTemp(Ity_I32);
+
+        assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+        compute_result_and_C_after_shift_by_imm5(
+                dis_buf, &irt_regN_shift, NULL,
+                irt_regN, shift_type, imm5, regN );
+
+        armSignedSatQ( irt_regN_shift, sat_imm, &irt_result, &irt_sat_Q );
+        or_into_QFLAG32( mkexpr(irt_sat_Q), condT );
+
+        if (isT)
+           putIRegT( regD, mkexpr(irt_result), condT );
+        else
+           putIRegA( regD, mkexpr(irt_result), condT, Ijk_Boring );
+
+        DIP( "ssat%s r%u, #0x%04x, %s\n",
+             nCC(conq), regD, imm5, dis_buf);
+        return True;
+    }
+    /* fall through */
+  }
+
+   /* -------------- usat16<c> <Rd>,#<imm4>,<Rn> --------------- */
+   {
+     UInt regD = 99, regN = 99, sat_imm = 99;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,4) == 0xF3A && (INSNT1(15,0) & 0xF0F0) == 0x0000) {
+           regN = INSNT0(3,0);
+           regD = INSNT1(11,8);
+           sat_imm = INSNT1(3,0);
+           if (!isBadRegT(regD) && !isBadRegT(regN))
+              gate = True;
+       }
+     } else {
+        if (INSNA(27,20) == BITS8(0,1,1,0,1,1,1,0) &&
+            INSNA(11,8)  == BITS4(1,1,1,1)         &&
+            INSNA(7,4)   == BITS4(0,0,1,1)) {
+           regD    = INSNA(15,12);
+           regN    = INSNA(3,0);
+           sat_imm = INSNA(19,16);
+           if (regD != 15 && regN != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp irt_regN    = newTemp(Ity_I32);
+        IRTemp irt_regN_lo = newTemp(Ity_I32);
+        IRTemp irt_regN_hi = newTemp(Ity_I32);
+        IRTemp irt_Q_lo    = newTemp(Ity_I32);
+        IRTemp irt_Q_hi    = newTemp(Ity_I32);
+        IRTemp irt_res_lo  = newTemp(Ity_I32);
+        IRTemp irt_res_hi  = newTemp(Ity_I32);
+
+        assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+        assign( irt_regN_lo, binop( Iop_Sar32, 
+                                    binop(Iop_Shl32, mkexpr(irt_regN), mkU8(16)), 
+                                    mkU8(16)) );
+        assign( irt_regN_hi, binop(Iop_Sar32, mkexpr(irt_regN), mkU8(16)) );
+
+        armUnsignedSatQ( &irt_res_lo, &irt_Q_lo, irt_regN_lo, sat_imm );
+        or_into_QFLAG32( mkexpr(irt_Q_lo), condT );
+
+        armUnsignedSatQ( &irt_res_hi, &irt_Q_hi, irt_regN_hi, sat_imm );
+        or_into_QFLAG32( mkexpr(irt_Q_hi), condT );
+
+        IRExpr* ire_result = binop( Iop_Or32, 
+                                    binop(Iop_Shl32, mkexpr(irt_res_hi), mkU8(16)),
+                                    mkexpr(irt_res_lo) );
+
+        if (isT)
+           putIRegT( regD, ire_result, condT );
+        else
+           putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+        DIP( "usat16%s r%u, #0x%04x, r%u\n", nCC(conq), regD, sat_imm, regN );
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* -------------- uadd16<c> <Rd>,<Rn>,<Rm> -------------- */
+   {
+     UInt regD = 99, regN = 99, regM = 99;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,4) == 0xFA9 && (INSNT1(15,0) & 0xF0F0) == 0xF040) {
+           regN = INSNT0(3,0);
+           regD = INSNT1(11,8);
+           regM = INSNT1(3,0);
+           if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+              gate = True;
+        }
+     } else {
+        if (INSNA(27,20) == BITS8(0,1,1,0,0,1,0,1) && 
+            INSNA(11,8)  == BITS4(1,1,1,1)         &&
+            INSNA(7,4)   == BITS4(0,0,0,1)) {
+           regD = INSNA(15,12);
+           regN = INSNA(19,16);
+           regM = INSNA(3,0);
+           if (regD != 15 && regN != 15 && regM != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp rNt  = newTemp(Ity_I32);
+        IRTemp rMt  = newTemp(Ity_I32);
+        IRTemp res  = newTemp(Ity_I32);
+        IRTemp reso = newTemp(Ity_I32);
+
+        assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+        assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+        assign(res, binop(Iop_Add16x2, mkexpr(rNt), mkexpr(rMt)));
+        if (isT)
+           putIRegT( regD, mkexpr(res), condT );
+        else
+           putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+        assign(reso, binop(Iop_HAdd16Ux2, mkexpr(rNt), mkexpr(rMt)));
+        set_GE_32_10_from_bits_31_15(reso, condT);
+
+        DIP("uadd16%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* -------------- sadd16<c> <Rd>,<Rn>,<Rm> -------------- */
+   {
+     UInt regD = 99, regN = 99, regM = 99;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,4) == 0xFA9 && (INSNT1(15,0) & 0xF0F0) == 0xF000) {
+           regN = INSNT0(3,0);
+           regD = INSNT1(11,8);
+           regM = INSNT1(3,0);
+           if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+              gate = True;
+        }
+     } else {
+        if (INSNA(27,20) == BITS8(0,1,1,0,0,0,0,1) && 
+            INSNA(11,8)  == BITS4(1,1,1,1)         &&
+            INSNA(7,4)   == BITS4(0,0,0,1)) {
+           regD = INSNA(15,12);
+           regN = INSNA(19,16);
+           regM = INSNA(3,0);
+           if (regD != 15 && regN != 15 && regM != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp rNt  = newTemp(Ity_I32);
+        IRTemp rMt  = newTemp(Ity_I32);
+        IRTemp res  = newTemp(Ity_I32);
+        IRTemp reso = newTemp(Ity_I32);
+
+        assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+        assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+        assign(res, binop(Iop_Add16x2, mkexpr(rNt), mkexpr(rMt)));
+        if (isT)
+           putIRegT( regD, mkexpr(res), condT );
+        else
+           putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+        assign(reso, unop(Iop_Not32,
+                          binop(Iop_HAdd16Sx2, mkexpr(rNt), mkexpr(rMt))));
+        set_GE_32_10_from_bits_31_15(reso, condT);
+
+        DIP("sadd16%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ---------------- usub16<c> <Rd>,<Rn>,<Rm> ---------------- */
+   {
+     UInt regD = 99, regN = 99, regM = 99;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,4) == 0xFAD && (INSNT1(15,0) & 0xF0F0) == 0xF040) {
+           regN = INSNT0(3,0);
+           regD = INSNT1(11,8);
+           regM = INSNT1(3,0);
+           if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+              gate = True;
+        }
+     } else {
+        if (INSNA(27,20) == BITS8(0,1,1,0,0,1,0,1) && 
+            INSNA(11,8)  == BITS4(1,1,1,1)         &&
+            INSNA(7,4)   == BITS4(0,1,1,1)) {
+           regD = INSNA(15,12);
+           regN = INSNA(19,16);
+           regM = INSNA(3,0);
+           if (regD != 15 && regN != 15 && regM != 15)
+             gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp rNt  = newTemp(Ity_I32);
+        IRTemp rMt  = newTemp(Ity_I32);
+        IRTemp res  = newTemp(Ity_I32);
+        IRTemp reso = newTemp(Ity_I32);
+
+        assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+        assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+        assign(res, binop(Iop_Sub16x2, mkexpr(rNt), mkexpr(rMt)));
+        if (isT)
+           putIRegT( regD, mkexpr(res), condT );
+        else
+           putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+        assign(reso, unop(Iop_Not32,
+                          binop(Iop_HSub16Ux2, mkexpr(rNt), mkexpr(rMt))));
+        set_GE_32_10_from_bits_31_15(reso, condT);
+
+        DIP("usub16%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* -------------- ssub16<c> <Rd>,<Rn>,<Rm> -------------- */
+   /* Accepts the Thumb form (0xFAD / 0xF000) or the ARM form, stores
+      Iop_Sub16x2(Rn, Rm) in Rd, and hands the complement of
+      Iop_HSub16Sx2(Rn, Rm) to set_GE_32_10_from_bits_31_15. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFAD
+             && (INSNT1(15,0) & 0xF0F0) == 0xF000) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,0,0,1)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(0,1,1,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN  = newTemp(Ity_I32);
+        IRTemp srcM  = newTemp(Ity_I32);
+        IRTemp lanes = newTemp(Ity_I32);
+        IRTemp geSrc = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        /* lane-wise result goes to Rd */
+        assign( lanes, binop(Iop_Sub16x2, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(lanes), condT );
+        } else {
+           putIRegA( rD, mkexpr(lanes), condT, Ijk_Boring );
+        }
+
+        /* GE flags are taken from the inverted halving-subtract value */
+        assign( geSrc,
+                unop(Iop_Not32,
+                     binop(Iop_HSub16Sx2, mkexpr(srcN), mkexpr(srcM))) );
+        set_GE_32_10_from_bits_31_15( geSrc, condT );
+
+        DIP("ssub16%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ----------------- uadd8<c> <Rd>,<Rn>,<Rm> ---------------- */
+   /* Accepts the Thumb form (0xFA8 / 0xF040) or the ARM form, stores
+      Iop_Add8x4(Rn, Rm) in Rd, and passes Iop_HAdd8Ux4(Rn, Rm),
+      uninverted, to set_GE_3_2_1_0_from_bits_31_23_15_7. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFA8
+             && (INSNT1(15,0) & 0xF0F0) == 0xF040) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,1,0,1)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(1,0,0,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN  = newTemp(Ity_I32);
+        IRTemp srcM  = newTemp(Ity_I32);
+        IRTemp lanes = newTemp(Ity_I32);
+        IRTemp geSrc = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        /* lane-wise result goes to Rd */
+        assign( lanes, binop(Iop_Add8x4, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(lanes), condT );
+        } else {
+           putIRegA( rD, mkexpr(lanes), condT, Ijk_Boring );
+        }
+
+        /* GE flags are taken from the halving-add value as is */
+        assign( geSrc, binop(Iop_HAdd8Ux4, mkexpr(srcN), mkexpr(srcM)) );
+        set_GE_3_2_1_0_from_bits_31_23_15_7( geSrc, condT );
+
+        DIP("uadd8%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------- sadd8<c> <Rd>,<Rn>,<Rm> ------------------ */
+   /* Accepts the Thumb form (0xFA8 / 0xF000) or the ARM form, stores
+      Iop_Add8x4(Rn, Rm) in Rd, and hands the complement of
+      Iop_HAdd8Sx4(Rn, Rm) to set_GE_3_2_1_0_from_bits_31_23_15_7. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFA8
+             && (INSNT1(15,0) & 0xF0F0) == 0xF000) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,0,0,1)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(1,0,0,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN  = newTemp(Ity_I32);
+        IRTemp srcM  = newTemp(Ity_I32);
+        IRTemp lanes = newTemp(Ity_I32);
+        IRTemp geSrc = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        /* lane-wise result goes to Rd */
+        assign( lanes, binop(Iop_Add8x4, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(lanes), condT );
+        } else {
+           putIRegA( rD, mkexpr(lanes), condT, Ijk_Boring );
+        }
+
+        /* GE flags are taken from the inverted halving-add value */
+        assign( geSrc,
+                unop(Iop_Not32,
+                     binop(Iop_HAdd8Sx4, mkexpr(srcN), mkexpr(srcM))) );
+        set_GE_3_2_1_0_from_bits_31_23_15_7( geSrc, condT );
+
+        DIP("sadd8%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------- usub8<c> <Rd>,<Rn>,<Rm> ------------------ */
+   /* Accepts the Thumb form (0xFAC / 0xF040) or the ARM form, stores
+      Iop_Sub8x4(Rn, Rm) in Rd, and hands the complement of
+      Iop_HSub8Ux4(Rn, Rm) to set_GE_3_2_1_0_from_bits_31_23_15_7. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFAC
+             && (INSNT1(15,0) & 0xF0F0) == 0xF040) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,1,0,1)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(1,1,1,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN  = newTemp(Ity_I32);
+        IRTemp srcM  = newTemp(Ity_I32);
+        IRTemp lanes = newTemp(Ity_I32);
+        IRTemp geSrc = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        /* lane-wise result goes to Rd */
+        assign( lanes, binop(Iop_Sub8x4, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(lanes), condT );
+        } else {
+           putIRegA( rD, mkexpr(lanes), condT, Ijk_Boring );
+        }
+
+        /* GE flags are taken from the inverted halving-subtract value */
+        assign( geSrc,
+                unop(Iop_Not32,
+                     binop(Iop_HSub8Ux4, mkexpr(srcN), mkexpr(srcM))) );
+        set_GE_3_2_1_0_from_bits_31_23_15_7( geSrc, condT );
+
+        DIP("usub8%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------- ssub8<c> <Rd>,<Rn>,<Rm> ------------------ */
+   /* Accepts the Thumb form (0xFAC / 0xF000) or the ARM form, stores
+      Iop_Sub8x4(Rn, Rm) in Rd, and hands the complement of
+      Iop_HSub8Sx4(Rn, Rm) to set_GE_3_2_1_0_from_bits_31_23_15_7. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFAC
+             && (INSNT1(15,0) & 0xF0F0) == 0xF000) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,0,0,1)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(1,1,1,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN  = newTemp(Ity_I32);
+        IRTemp srcM  = newTemp(Ity_I32);
+        IRTemp lanes = newTemp(Ity_I32);
+        IRTemp geSrc = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        /* lane-wise result goes to Rd */
+        assign( lanes, binop(Iop_Sub8x4, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(lanes), condT );
+        } else {
+           putIRegA( rD, mkexpr(lanes), condT, Ijk_Boring );
+        }
+
+        /* GE flags are taken from the inverted halving-subtract value */
+        assign( geSrc,
+                unop(Iop_Not32,
+                     binop(Iop_HSub8Sx4, mkexpr(srcN), mkexpr(srcM))) );
+        set_GE_3_2_1_0_from_bits_31_23_15_7( geSrc, condT );
+
+        DIP("ssub8%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------ qadd8<c> <Rd>,<Rn>,<Rm> ------------------- */
+   /* Accepts the Thumb form (0xFA8 / 0xF010) or the ARM form and
+      stores Iop_QAdd8Sx4(Rn, Rm) in Rd.  No flags are written here. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFA8
+             && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(1,0,0,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN = newTemp(Ity_I32);
+        IRTemp srcM = newTemp(Ity_I32);
+        IRTemp satv = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        assign( satv, binop(Iop_QAdd8Sx4, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(satv), condT );
+        } else {
+           putIRegA( rD, mkexpr(satv), condT, Ijk_Boring );
+        }
+
+        DIP("qadd8%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------ qsub8<c> <Rd>,<Rn>,<Rm> ------------------- */
+   /* Accepts the Thumb form (0xFAC / 0xF010) or the ARM form and
+      stores Iop_QSub8Sx4(Rn, Rm) in Rd.  No flags are written here. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFAC
+             && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(1,1,1,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN = newTemp(Ity_I32);
+        IRTemp srcM = newTemp(Ity_I32);
+        IRTemp satv = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        assign( satv, binop(Iop_QSub8Sx4, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(satv), condT );
+        } else {
+           putIRegA( rD, mkexpr(satv), condT, Ijk_Boring );
+        }
+
+        DIP("qsub8%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------ uqadd8<c> <Rd>,<Rn>,<Rm> ------------------ */
+   /* Accepts the Thumb form (0xFA8 / 0xF050) or the ARM form and
+      stores Iop_QAdd8Ux4(Rn, Rm) in Rd.  No flags are written here. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFA8
+             && (INSNT1(15,0) & 0xF0F0) == 0xF050) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,1,1,0)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(1,0,0,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN = newTemp(Ity_I32);
+        IRTemp srcM = newTemp(Ity_I32);
+        IRTemp satv = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        assign( satv, binop(Iop_QAdd8Ux4, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(satv), condT );
+        } else {
+           putIRegA( rD, mkexpr(satv), condT, Ijk_Boring );
+        }
+
+        DIP("uqadd8%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------ uqsub8<c> <Rd>,<Rn>,<Rm> ------------------ */
+   /* Accepts the Thumb form (0xFAC / 0xF050) or the ARM form and
+      stores Iop_QSub8Ux4(Rn, Rm) in Rd.  No flags are written here. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFAC
+             && (INSNT1(15,0) & 0xF0F0) == 0xF050) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,1,1,0)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(1,1,1,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN = newTemp(Ity_I32);
+        IRTemp srcM = newTemp(Ity_I32);
+        IRTemp satv = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        assign( satv, binop(Iop_QSub8Ux4, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(satv), condT );
+        } else {
+           putIRegA( rD, mkexpr(satv), condT, Ijk_Boring );
+        }
+
+        DIP("uqsub8%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ----------------- uhadd8<c> <Rd>,<Rn>,<Rm> ------------------- */
+   /* Accepts the Thumb form (0xFA8 / 0xF060) or the ARM form and
+      stores Iop_HAdd8Ux4(Rn, Rm) in Rd.  No flags are written here. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFA8
+             && (INSNT1(15,0) & 0xF0F0) == 0xF060) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,1,1,1)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(1,0,0,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN = newTemp(Ity_I32);
+        IRTemp srcM = newTemp(Ity_I32);
+        IRTemp half = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        assign( half, binop(Iop_HAdd8Ux4, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(half), condT );
+        } else {
+           putIRegA( rD, mkexpr(half), condT, Ijk_Boring );
+        }
+
+        DIP("uhadd8%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ----------------- shadd8<c> <Rd>,<Rn>,<Rm> ------------------- */
+   /* Accepts the Thumb form (0xFA8 / 0xF020) or the ARM form and
+      stores Iop_HAdd8Sx4(Rn, Rm) in Rd.  No flags are written here. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFA8
+             && (INSNT1(15,0) & 0xF0F0) == 0xF020) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,0,1,1)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(1,0,0,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN = newTemp(Ity_I32);
+        IRTemp srcM = newTemp(Ity_I32);
+        IRTemp half = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        assign( half, binop(Iop_HAdd8Sx4, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(half), condT );
+        } else {
+           putIRegA( rD, mkexpr(half), condT, Ijk_Boring );
+        }
+
+        DIP("shadd8%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------ qadd16<c> <Rd>,<Rn>,<Rm> ------------------ */
+   /* Accepts the Thumb form (0xFA9 / 0xF010) or the ARM form and
+      stores Iop_QAdd16Sx2(Rn, Rm) in Rd.  No flags are written here. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFA9
+             && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(0,0,0,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN = newTemp(Ity_I32);
+        IRTemp srcM = newTemp(Ity_I32);
+        IRTemp satv = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        assign( satv, binop(Iop_QAdd16Sx2, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(satv), condT );
+        } else {
+           putIRegA( rD, mkexpr(satv), condT, Ijk_Boring );
+        }
+
+        DIP("qadd16%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------ qsub16<c> <Rd>,<Rn>,<Rm> ------------------ */
+   /* Accepts the Thumb form (0xFAD / 0xF010) or the ARM form and
+      stores Iop_QSub16Sx2(Rn, Rm) in Rd.  No flags are written here. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFAD
+             && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(0,1,1,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN = newTemp(Ity_I32);
+        IRTemp srcM = newTemp(Ity_I32);
+        IRTemp satv = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        assign( satv, binop(Iop_QSub16Sx2, mkexpr(srcN), mkexpr(srcM)) );
+        if (isT) {
+           putIRegT( rD, mkexpr(satv), condT );
+        } else {
+           putIRegA( rD, mkexpr(satv), condT, Ijk_Boring );
+        }
+
+        DIP("qsub16%s r%u, r%u, r%u\n", nCC(conq),rD,rN,rM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /////////////////////////////////////////////////////////////////
+   /////////////////////////////////////////////////////////////////
+   /////////////////////////////////////////////////////////////////
+   /////////////////////////////////////////////////////////////////
+   /////////////////////////////////////////////////////////////////
+
+   /* ------------------- qsax<c> <Rd>,<Rn>,<Rm> ------------------- */
+   /* note: the hardware seems to construct the result differently
+      from what the manual says. */
+   /* As coded here: the upper half of Rd is the saturated value of
+      (Rn upper half - Rm lower half) and the lower half of Rd is the
+      saturated value of (Rn lower half + Rm upper half); halves are
+      sign-extended first, and armSignedSatQ is called with 0x10. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFAE
+             && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(0,1,0,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN    = newTemp(Ity_I32);
+        IRTemp srcM    = newTemp(Ity_I32);
+        IRTemp rawSum  = newTemp(Ity_I32);
+        IRTemp rawDiff = newTemp(Ity_I32);
+        IRTemp satSum  = newTemp(Ity_I32);
+        IRTemp satDiff = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        /* difference: Rn upper half minus Rm lower half */
+        IRExpr* nUpper = binop( Iop_Sar32, mkexpr(srcN), mkU8(16) );
+        IRExpr* mLower = binop( Iop_Sar32,
+                                binop( Iop_Shl32, mkexpr(srcM), mkU8(16) ),
+                                mkU8(16) );
+        assign( rawDiff, binop( Iop_Sub32, nUpper, mLower ) );
+        armSignedSatQ( rawDiff, 0x10, &satDiff, NULL );
+
+        /* sum: Rn lower half plus Rm upper half */
+        IRExpr* nLower = binop( Iop_Sar32,
+                                binop( Iop_Shl32, mkexpr(srcN), mkU8(16) ),
+                                mkU8(16) );
+        IRExpr* mUpper = binop( Iop_Sar32, mkexpr(srcM), mkU8(16) );
+        assign( rawSum, binop( Iop_Add32, nLower, mUpper ) );
+        armSignedSatQ( rawSum, 0x10, &satSum, NULL );
+
+        /* pack: saturated difference on top, saturated sum below */
+        IRExpr* packed
+           = binop( Iop_Or32,
+                    binop( Iop_Shl32, mkexpr(satDiff), mkU8(16) ),
+                    binop( Iop_And32, mkexpr(satSum), mkU32(0xFFFF) ) );
+
+        if (isT) {
+           putIRegT( rD, packed, condT );
+        } else {
+           putIRegA( rD, packed, condT, Ijk_Boring );
+        }
+
+        DIP( "qsax%s r%u, r%u, r%u\n", nCC(conq), rD, rN, rM );
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------- qasx<c> <Rd>,<Rn>,<Rm> ------------------- */
+   /* As coded here: the upper half of Rd is the saturated value of
+      (Rn upper half + Rm lower half) and the lower half of Rd is the
+      saturated value of (Rn lower half - Rm upper half); halves are
+      sign-extended first, and armSignedSatQ is called with 0x10. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFAA
+             && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(0,0,1,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN    = newTemp(Ity_I32);
+        IRTemp srcM    = newTemp(Ity_I32);
+        IRTemp rawSum  = newTemp(Ity_I32);
+        IRTemp rawDiff = newTemp(Ity_I32);
+        IRTemp satSum  = newTemp(Ity_I32);
+        IRTemp satDiff = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        /* difference: Rn lower half minus Rm upper half */
+        IRExpr* nLower = binop( Iop_Sar32,
+                                binop( Iop_Shl32, mkexpr(srcN), mkU8(16) ),
+                                mkU8(16) );
+        IRExpr* mUpper = binop( Iop_Sar32, mkexpr(srcM), mkU8(16) );
+        assign( rawDiff, binop( Iop_Sub32, nLower, mUpper ) );
+        armSignedSatQ( rawDiff, 0x10, &satDiff, NULL );
+
+        /* sum: Rn upper half plus Rm lower half */
+        IRExpr* nUpper = binop( Iop_Sar32, mkexpr(srcN), mkU8(16) );
+        IRExpr* mLower = binop( Iop_Sar32,
+                                binop( Iop_Shl32, mkexpr(srcM), mkU8(16) ),
+                                mkU8(16) );
+        assign( rawSum, binop( Iop_Add32, nUpper, mLower ) );
+        armSignedSatQ( rawSum, 0x10, &satSum, NULL );
+
+        /* pack: saturated sum on top, saturated difference below */
+        IRExpr* packed
+           = binop( Iop_Or32,
+                    binop( Iop_Shl32, mkexpr(satSum), mkU8(16) ),
+                    binop( Iop_And32, mkexpr(satDiff), mkU32(0xFFFF) ) );
+
+        if (isT) {
+           putIRegT( rD, packed, condT );
+        } else {
+           putIRegA( rD, packed, condT, Ijk_Boring );
+        }
+
+        DIP( "qasx%s r%u, r%u, r%u\n", nCC(conq), rD, rN, rM );
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------- sasx<c> <Rd>,<Rn>,<Rm> ------------------- */
+   /* As coded here: the upper half of Rd is (Rn upper half + Rm lower
+      half) and the lower half of Rd is (Rn lower half - Rm upper
+      half), unsaturated.  GE flags 0 and 1 are written from the
+      complement of the difference, GE flags 2 and 3 from the
+      complement of the sum. */
+   {
+     UInt rD = 99, rN = 99, rM = 99;
+     Bool accept = False;
+
+     if (isT && INSNT0(15,4) == 0xFAA
+             && (INSNT1(15,0) & 0xF0F0) == 0xF000) {
+        rN = INSNT0(3,0);
+        rD = INSNT1(11,8);
+        rM = INSNT1(3,0);
+        if (!(isBadRegT(rD) || isBadRegT(rN) || isBadRegT(rM)))
+           accept = True;
+     }
+     if (!isT && INSNA(27,20) == BITS8(0,1,1,0,0,0,0,1)
+              && INSNA(11,8)  == BITS4(1,1,1,1)
+              && INSNA(7,4)   == BITS4(0,0,1,1)) {
+        rD = INSNA(15,12);
+        rN = INSNA(19,16);
+        rM = INSNA(3,0);
+        /* r15 is rejected for every operand */
+        if (!(rD == 15 || rN == 15 || rM == 15))
+           accept = True;
+     }
+
+     if (accept) {
+        IRTemp srcN    = newTemp(Ity_I32);
+        IRTemp srcM    = newTemp(Ity_I32);
+        IRTemp rawSum  = newTemp(Ity_I32);
+        IRTemp rawDiff = newTemp(Ity_I32);
+
+        if (isT) {
+           assign( srcN, getIRegT(rN) );
+           assign( srcM, getIRegT(rM) );
+        } else {
+           assign( srcN, getIRegA(rN) );
+           assign( srcM, getIRegA(rM) );
+        }
+
+        /* difference: Rn lower half minus Rm upper half */
+        IRExpr* nLower = binop( Iop_Sar32,
+                                binop( Iop_Shl32, mkexpr(srcN), mkU8(16) ),
+                                mkU8(16) );
+        IRExpr* mUpper = binop( Iop_Sar32, mkexpr(srcM), mkU8(16) );
+        assign( rawDiff, binop( Iop_Sub32, nLower, mUpper ) );
+
+        /* sum: Rn upper half plus Rm lower half */
+        IRExpr* nUpper = binop( Iop_Sar32, mkexpr(srcN), mkU8(16) );
+        IRExpr* mLower = binop( Iop_Sar32,
+                                binop( Iop_Shl32, mkexpr(srcM), mkU8(16) ),
+                                mkU8(16) );
+        assign( rawSum, binop( Iop_Add32, nUpper, mLower ) );
+
+        /* flags for the lower lane come from the difference */
+        IRTemp geLow = newTemp(Ity_I32);
+        assign( geLow, unop(Iop_Not32, mkexpr(rawDiff)) );
+        put_GEFLAG32( 0, 31, mkexpr(geLow), condT );
+        put_GEFLAG32( 1, 31, mkexpr(geLow), condT );
+
+        /* flags for the upper lane come from the sum */
+        IRTemp geHigh = newTemp(Ity_I32);
+        assign( geHigh, unop(Iop_Not32, mkexpr(rawSum)) );
+        put_GEFLAG32( 2, 31, mkexpr(geHigh), condT );
+        put_GEFLAG32( 3, 31, mkexpr(geHigh), condT );
+
+        /* pack: sum on top, difference below */
+        IRExpr* packed
+           = binop( Iop_Or32,
+                    binop( Iop_Shl32, mkexpr(rawSum), mkU8(16) ),
+                    binop( Iop_And32, mkexpr(rawDiff), mkU32(0xFFFF) ) );
+
+        if (isT) {
+           putIRegT( rD, packed, condT );
+        } else {
+           putIRegA( rD, packed, condT, Ijk_Boring );
+        }
+
+        DIP( "sasx%s r%u, r%u, r%u\n", nCC(conq), rD, rN, rM );
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* --------------- smuad, smuadx<c><Rd>,<Rn>,<Rm> --------------- */
+   /* --------------- smusd, smusdx<c><Rd>,<Rn>,<Rm> --------------- */
+   /* Dual signed 16x16 multiply; the two products are added (isAD)
+      or subtracted (!isAD) and the result is written to Rd.  With
+      bitM set, Rm is first passed through genROR32 with amount 16.
+      Only the add form updates the Q flag. */
+   {
+     UInt regD = 99, regN = 99, regM = 99, bitM = 99;
+     Bool gate = False, isAD = False;
+
+     if (isT) {
+        /* Thumb: first halfword 0xFB2x is the add form, 0xFB4x the
+           subtract form; bit 4 of the second halfword is bitM. */
+        if ((INSNT0(15,4) == 0xFB2 || INSNT0(15,4) == 0xFB4)
+            && (INSNT1(15,0) & 0xF0E0) == 0xF000) {
+           regN = INSNT0(3,0);
+           regD = INSNT1(11,8);
+           regM = INSNT1(3,0);
+           bitM = INSNT1(4,4);
+           isAD = INSNT0(15,4) == 0xFB2;
+           if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+              gate = True;
+        }
+     } else {
+        /* ARM: bits 15:12 must be all ones; bit 6 clear selects the
+           add form and bit 5 is bitM. */
+        if (INSNA(27,20) == BITS8(0,1,1,1,0,0,0,0) &&
+            INSNA(15,12) == BITS4(1,1,1,1)         &&
+            (INSNA(7,4) & BITS4(1,0,0,1)) == BITS4(0,0,0,1) ) {
+           regD = INSNA(19,16);
+           regN = INSNA(3,0);
+           regM = INSNA(11,8);
+           bitM = INSNA(5,5);
+           isAD = INSNA(6,6) == 0;
+           if (regD != 15 && regN != 15 && regM != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp irt_regN    = newTemp(Ity_I32);
+        IRTemp irt_regM    = newTemp(Ity_I32);
+        IRTemp irt_prod_lo = newTemp(Ity_I32);
+        IRTemp irt_prod_hi = newTemp(Ity_I32);
+        IRTemp tmpM        = newTemp(Ity_I32);
+
+        assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+
+        /* irt_regM is Rm, rotated by 16 when bitM is set */
+        assign( tmpM, isT ? getIRegT(regM) : getIRegA(regM) );
+        assign( irt_regM, genROR32(tmpM, (bitM & 1) ? 16 : 0) );
+
+        /* product of the sign-extended lower halves (shift left 16,
+           then arithmetic shift right 16) */
+        assign( irt_prod_lo, 
+                binop( Iop_Mul32, 
+                       binop( Iop_Sar32, 
+                              binop(Iop_Shl32, mkexpr(irt_regN), mkU8(16)), 
+                              mkU8(16) ), 
+                       binop( Iop_Sar32, 
+                              binop(Iop_Shl32, mkexpr(irt_regM), mkU8(16)), 
+                              mkU8(16) ) ) );
+        /* product of the sign-extended upper halves */
+        assign( irt_prod_hi, binop(Iop_Mul32, 
+                                   binop(Iop_Sar32, mkexpr(irt_regN), mkU8(16)), 
+                                   binop(Iop_Sar32, mkexpr(irt_regM), mkU8(16))) );
+        /* prod_lo + prod_hi for the add form, prod_lo - prod_hi
+           for the subtract form */
+        IRExpr* ire_result 
+           = binop( isAD ? Iop_Add32 : Iop_Sub32,
+                    mkexpr(irt_prod_lo), mkexpr(irt_prod_hi) );
+
+        if (isT)
+           putIRegT( regD, ire_result, condT );
+        else
+           putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+        /* only the add form ORs a signed-overflow indication into
+           the Q flag */
+        if (isAD) {
+           or_into_QFLAG32(
+              signed_overflow_after_Add32( ire_result,
+                                           irt_prod_lo, irt_prod_hi ),
+              condT
+           );
+        }
+
+        DIP("smu%cd%s%s r%u, r%u, r%u\n",
+            isAD ? 'a' : 's',
+            bitM ? "x" : "", nCC(conq), regD, regN, regM);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* --------------- smlad{X}<c> <Rd>,<Rn>,<Rm>,<Ra> -------------- */
+   /* --------------- smlsd{X}<c> <Rd>,<Rn>,<Rm>,<Ra> -------------- */
+   /* Dual signed 16x16 multiply with accumulate: the two products
+      are added (isAD) or subtracted (!isAD), Ra is added to that
+      value, and the total is written to Rd.  With bitM set, Rm is
+      first passed through genROR32 with amount 16.  The Q flag is
+      updated for the accumulate step and, in the add form only, for
+      the product sum as well. */
+   {
+     UInt regD = 99, regN = 99, regM = 99, regA = 99, bitM = 99;
+     Bool gate = False, isAD = False;
+
+     if (isT) {
+       /* Thumb: first halfword 0xFB2x is the add form, 0xFB4x the
+          subtract form; Ra is bits 15:12 of the second halfword. */
+       if ((INSNT0(15,4) == 0xFB2 || INSNT0(15,4) == 0xFB4)
+           && INSNT1(7,5) == BITS3(0,0,0)) {
+           regN = INSNT0(3,0);
+           regD = INSNT1(11,8);
+           regM = INSNT1(3,0);
+           regA = INSNT1(15,12);
+           bitM = INSNT1(4,4);
+           isAD = INSNT0(15,4) == 0xFB2;
+           if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM)
+               && !isBadRegT(regA))
+              gate = True;
+        }
+     } else {
+        /* ARM: Ra is bits 15:12 and must not be r15; bit 6 clear
+           selects the add form and bit 5 is bitM. */
+        if (INSNA(27,20) == BITS8(0,1,1,1,0,0,0,0) &&
+            (INSNA(7,4) & BITS4(1,0,0,1)) == BITS4(0,0,0,1)) {
+           regD = INSNA(19,16);
+           regA = INSNA(15,12);
+           regN = INSNA(3,0);
+           regM = INSNA(11,8);
+           bitM = INSNA(5,5);
+           isAD = INSNA(6,6) == 0;
+           if (regD != 15 && regN != 15 && regM != 15 && regA != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp irt_regN    = newTemp(Ity_I32);
+        IRTemp irt_regM    = newTemp(Ity_I32);
+        IRTemp irt_regA    = newTemp(Ity_I32);
+        IRTemp irt_prod_lo = newTemp(Ity_I32);
+        IRTemp irt_prod_hi = newTemp(Ity_I32);
+        IRTemp irt_sum     = newTemp(Ity_I32);
+        IRTemp tmpM        = newTemp(Ity_I32);
+
+        assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+        assign( irt_regA, isT ? getIRegT(regA) : getIRegA(regA) );
+
+        /* irt_regM is Rm, rotated by 16 when bitM is set */
+        assign( tmpM, isT ? getIRegT(regM) : getIRegA(regM) );
+        assign( irt_regM, genROR32(tmpM, (bitM & 1) ? 16 : 0) );
+
+        /* product of the sign-extended lower halves (shift left 16,
+           then arithmetic shift right 16) */
+        assign( irt_prod_lo, 
+                binop(Iop_Mul32, 
+                      binop(Iop_Sar32, 
+                            binop( Iop_Shl32, mkexpr(irt_regN), mkU8(16) ), 
+                            mkU8(16)), 
+                      binop(Iop_Sar32, 
+                            binop( Iop_Shl32, mkexpr(irt_regM), mkU8(16) ), 
+                            mkU8(16))) );
+        /* product of the sign-extended upper halves */
+        assign( irt_prod_hi, 
+                binop( Iop_Mul32, 
+                       binop( Iop_Sar32, mkexpr(irt_regN), mkU8(16) ), 
+                       binop( Iop_Sar32, mkexpr(irt_regM), mkU8(16) ) ) );
+        /* prod_lo + prod_hi for the add form, prod_lo - prod_hi
+           for the subtract form */
+        assign( irt_sum, binop( isAD ? Iop_Add32 : Iop_Sub32, 
+                                mkexpr(irt_prod_lo), mkexpr(irt_prod_hi) ) );
+
+        /* accumulate Ra on top of the combined products */
+        IRExpr* ire_result = binop(Iop_Add32, mkexpr(irt_sum), mkexpr(irt_regA));
+
+        if (isT)
+           putIRegT( regD, ire_result, condT );
+        else
+           putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+        /* add form only: overflow indication for prod_lo + prod_hi */
+        if (isAD) {
+           or_into_QFLAG32(
+              signed_overflow_after_Add32( mkexpr(irt_sum),
+                                           irt_prod_lo, irt_prod_hi ),
+              condT
+           );
+        }
+
+        /* both forms: overflow indication for the accumulate step */
+        or_into_QFLAG32(
+           signed_overflow_after_Add32( ire_result, irt_sum, irt_regA ),
+           condT
+        );
+
+        DIP("sml%cd%s%s r%u, r%u, r%u, r%u\n",
+            isAD ? 'a' : 's',
+            bitM ? "x" : "", nCC(conq), regD, regN, regM, regA);
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ----- smlabb, smlabt, smlatb, smlatt <Rd>,<Rn>,<Rm>,<Ra> ----- */
+   {
+     UInt regD = 99, regN = 99, regM = 99, regA = 99, bitM = 99, bitN = 99;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,4) == 0xFB1 && INSNT1(7,6) == BITS2(0,0)) {
+           regN = INSNT0(3,0);
+           regD = INSNT1(11,8);
+           regM = INSNT1(3,0);
+           regA = INSNT1(15,12);
+           bitM = INSNT1(4,4);
+           bitN = INSNT1(5,5);
+           if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM)
+               && !isBadRegT(regA))
+              gate = True;
+        }
+     } else {
+        if (INSNA(27,20) == BITS8(0,0,0,1,0,0,0,0) &&
+            (INSNA(7,4) & BITS4(1,0,0,1)) == BITS4(1,0,0,0)) {
+           regD = INSNA(19,16);
+           regN = INSNA(3,0);
+           regM = INSNA(11,8);
+           regA = INSNA(15,12);
+           bitM = INSNA(6,6);
+           bitN = INSNA(5,5);
+           if (regD != 15 && regN != 15 && regM != 15 && regA != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp irt_regA = newTemp(Ity_I32);
+        IRTemp irt_prod = newTemp(Ity_I32);
+
+        assign( irt_prod, 
+                binop(Iop_Mul32, 
+                      binop(Iop_Sar32, 
+                            binop(Iop_Shl32,
+                                  isT ? getIRegT(regN) : getIRegA(regN),
+                                  mkU8(bitN ? 0 : 16)),
+                            mkU8(16)), 
+                      binop(Iop_Sar32, 
+                            binop(Iop_Shl32,
+                                  isT ? getIRegT(regM) : getIRegA(regM),
+                                  mkU8(bitM ? 0 : 16)), 
+                            mkU8(16))) );
+
+        assign( irt_regA, isT ? getIRegT(regA) : getIRegA(regA) );
+
+        IRExpr* ire_result = binop(Iop_Add32, mkexpr(irt_prod), mkexpr(irt_regA));
+
+        if (isT)
+           putIRegT( regD, ire_result, condT );
+        else
+           putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+        or_into_QFLAG32(
+           signed_overflow_after_Add32( ire_result, irt_prod, irt_regA ),
+           condT
+        );
+
+        DIP( "smla%c%c%s r%u, r%u, r%u, r%u\n", 
+             bitN ? 't' : 'b', bitM ? 't' : 'b', 
+             nCC(conq), regD, regN, regM, regA );
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ----- smlawb, smlawt <Rd>,<Rn>,<Rm>,<Ra> ----- */
+   {
+     UInt regD = 99, regN = 99, regM = 99, regA = 99, bitM = 99;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,4) == 0xFB3 && INSNT1(7,5) == BITS3(0,0,0)) {
+           regN = INSNT0(3,0);
+           regD = INSNT1(11,8);
+           regM = INSNT1(3,0);
+           regA = INSNT1(15,12);
+           bitM = INSNT1(4,4);
+           if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM)
+               && !isBadRegT(regA))
+              gate = True;
+        }
+     } else {
+        if (INSNA(27,20) == BITS8(0,0,0,1,0,0,1,0) &&
+            (INSNA(7,4) & BITS4(1,0,1,1)) == BITS4(1,0,0,0)) {
+           regD = INSNA(19,16);
+           regN = INSNA(3,0);
+           regM = INSNA(11,8);
+           regA = INSNA(15,12);
+           bitM = INSNA(6,6);
+           if (regD != 15 && regN != 15 && regM != 15 && regA != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp irt_regA = newTemp(Ity_I32);
+        IRTemp irt_prod = newTemp(Ity_I64);
+
+        assign( irt_prod, 
+                binop(Iop_MullS32, 
+                      isT ? getIRegT(regN) : getIRegA(regN),
+                      binop(Iop_Sar32, 
+                            binop(Iop_Shl32,
+                                  isT ? getIRegT(regM) : getIRegA(regM),
+                                  mkU8(bitM ? 0 : 16)), 
+                            mkU8(16))) );
+
+        assign( irt_regA, isT ? getIRegT(regA) : getIRegA(regA) );
+
+        IRTemp prod32 = newTemp(Ity_I32);
+        assign(prod32,
+               binop(Iop_Or32,
+                     binop(Iop_Shl32, unop(Iop_64HIto32, mkexpr(irt_prod)), mkU8(16)),
+                     binop(Iop_Shr32, unop(Iop_64to32, mkexpr(irt_prod)), mkU8(16))
+        ));
+
+        IRExpr* ire_result = binop(Iop_Add32, mkexpr(prod32), mkexpr(irt_regA));
+
+        if (isT)
+           putIRegT( regD, ire_result, condT );
+        else
+           putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+        or_into_QFLAG32(
+           signed_overflow_after_Add32( ire_result, prod32, irt_regA ),
+           condT
+        );
+
+        DIP( "smlaw%c%s r%u, r%u, r%u, r%u\n", 
+             bitM ? 't' : 'b', 
+             nCC(conq), regD, regN, regM, regA );
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ------------------- sel<c> <Rd>,<Rn>,<Rm> -------------------- */
+   /* fixme: fix up the test in v6media.c so that we can pass the ge
+      flags as part of the test. */
+   {
+     UInt regD = 99, regN = 99, regM = 99;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,4) == 0xFAA && (INSNT1(15,0) & 0xF0F0) == 0xF080) {
+           regN = INSNT0(3,0);
+           regD = INSNT1(11,8);
+           regM = INSNT1(3,0);
+           if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+              gate = True;
+        }
+     } else {
+        if (INSNA(27,20) == BITS8(0,1,1,0,1,0,0,0) &&
+            INSNA(11,8)  == BITS4(1,1,1,1)         &&
+            INSNA(7,4)   == BITS4(1,0,1,1)) {
+           regD = INSNA(15,12);
+           regN = INSNA(19,16);
+           regM = INSNA(3,0);
+           if (regD != 15 && regN != 15 && regM != 15)
+              gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp irt_ge_flag0 = newTemp(Ity_I32);
+        IRTemp irt_ge_flag1 = newTemp(Ity_I32);
+        IRTemp irt_ge_flag2 = newTemp(Ity_I32);
+        IRTemp irt_ge_flag3 = newTemp(Ity_I32);
+
+        assign( irt_ge_flag0, get_GEFLAG32(0) );
+        assign( irt_ge_flag1, get_GEFLAG32(1) );
+        assign( irt_ge_flag2, get_GEFLAG32(2) );
+        assign( irt_ge_flag3, get_GEFLAG32(3) );
+
+        IRExpr* ire_ge_flag0_or 
+          = binop(Iop_Or32, mkexpr(irt_ge_flag0), 
+                  binop(Iop_Sub32, mkU32(0), mkexpr(irt_ge_flag0)));
+        IRExpr* ire_ge_flag1_or 
+          = binop(Iop_Or32, mkexpr(irt_ge_flag1), 
+                  binop(Iop_Sub32, mkU32(0), mkexpr(irt_ge_flag1)));
+        IRExpr* ire_ge_flag2_or 
+          = binop(Iop_Or32, mkexpr(irt_ge_flag2), 
+                  binop(Iop_Sub32, mkU32(0), mkexpr(irt_ge_flag2)));
+        IRExpr* ire_ge_flag3_or 
+          = binop(Iop_Or32, mkexpr(irt_ge_flag3), 
+                  binop(Iop_Sub32, mkU32(0), mkexpr(irt_ge_flag3)));
+
+        IRExpr* ire_ge_flags 
+          = binop( Iop_Or32, 
+                   binop(Iop_Or32, 
+                         binop(Iop_And32, 
+                               binop(Iop_Sar32, ire_ge_flag0_or, mkU8(31)), 
+                               mkU32(0x000000ff)), 
+                         binop(Iop_And32, 
+                               binop(Iop_Sar32, ire_ge_flag1_or, mkU8(31)), 
+                               mkU32(0x0000ff00))), 
+                   binop(Iop_Or32, 
+                         binop(Iop_And32, 
+                               binop(Iop_Sar32, ire_ge_flag2_or, mkU8(31)), 
+                               mkU32(0x00ff0000)), 
+                         binop(Iop_And32, 
+                               binop(Iop_Sar32, ire_ge_flag3_or, mkU8(31)), 
+                               mkU32(0xff000000))) );
+
+        IRExpr* ire_result 
+          = binop(Iop_Or32, 
+                  binop(Iop_And32,
+                        isT ? getIRegT(regN) : getIRegA(regN),
+                        ire_ge_flags ), 
+                  binop(Iop_And32,
+                        isT ? getIRegT(regM) : getIRegA(regM),
+                        unop(Iop_Not32, ire_ge_flags)));
+
+        if (isT)
+           putIRegT( regD, ire_result, condT );
+        else
+           putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+        DIP("sel%s r%u, r%u, r%u\n", nCC(conq), regD, regN, regM );
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ----------------- uxtab16<c> Rd,Rn,Rm{,rot} ------------------ */
+   {
+     UInt regD = 99, regN = 99, regM = 99, rotate = 99;
+     Bool gate = False;
+
+     if (isT) {
+        if (INSNT0(15,4) == 0xFA3 && (INSNT1(15,0) & 0xF0C0) == 0xF080) {
+           regN   = INSNT0(3,0);
+           regD   = INSNT1(11,8);
+           regM   = INSNT1(3,0);
+           rotate = INSNT1(5,4);
+           if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+              gate = True;
+        }
+     } else {
+        if (INSNA(27,20) == BITS8(0,1,1,0,1,1,0,0) &&
+            INSNA(9,4)   == BITS6(0,0,0,1,1,1) ) {
+           regD   = INSNA(15,12);
+           regN   = INSNA(19,16);
+           regM   = INSNA(3,0);
+           rotate = INSNA(11,10);
+           if (regD != 15 && regN != 15 && regM != 15)
+             gate = True;
+        }
+     }
+
+     if (gate) {
+        IRTemp irt_regN = newTemp(Ity_I32);
+        assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+
+        IRTemp irt_regM = newTemp(Ity_I32);
+        assign( irt_regM, isT ? getIRegT(regM) : getIRegA(regM) );
+
+        IRTemp irt_rot = newTemp(Ity_I32);
+        assign( irt_rot, binop(Iop_And32,
+                               genROR32(irt_regM, 8 * rotate),
+                               mkU32(0x00FF00FF)) );
+
+        IRExpr* resLo
+           = binop(Iop_And32,
+                   binop(Iop_Add32, mkexpr(irt_regN), mkexpr(irt_rot)),
+                   mkU32(0x0000FFFF));
+
+        IRExpr* resHi
+           = binop(Iop_Add32, 
+                   binop(Iop_And32, mkexpr(irt_regN), mkU32(0xFFFF0000)),
+                   binop(Iop_And32, mkexpr(irt_rot),  mkU32(0xFFFF0000)));
+
+        IRExpr* ire_result 
+           = binop( Iop_Or32, resHi, resLo );
+
+        if (isT)
+           putIRegT( regD, ire_result, condT );
+        else
+           putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+        DIP( "uxtab16%s r%u, r%u, r%u, ROR #%u\n", 
+             nCC(conq), regD, regN, regM, 8 * rotate );
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* --------------- usad8  Rd,Rn,Rm    ---------------- */
+   /* --------------- usada8 Rd,Rn,Rm,Ra ---------------- */
+   {
+     UInt rD = 99, rN = 99, rM = 99, rA = 99;
+     Bool gate = False;
+
+     if (isT) {
+       if (INSNT0(15,4) == 0xFB7 && INSNT1(7,4) == BITS4(0,0,0,0)) {
+           rN = INSNT0(3,0);
+           rA = INSNT1(15,12);
+           rD = INSNT1(11,8);
+           rM = INSNT1(3,0);
+           if (!isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM) && rA != 13)
+              gate = True;
+        }
+     } else {
+        if (INSNA(27,20) == BITS8(0,1,1,1,1,0,0,0) &&
+            INSNA(7,4)   == BITS4(0,0,0,1) ) {
+           rD = INSNA(19,16);
+           rA = INSNA(15,12);
+           rM = INSNA(11,8);
+           rN = INSNA(3,0);
+           if (rD != 15 && rN != 15 && rM != 15 /* but rA can be 15 */)
+              gate = True;
+        }
+     }
+     /* We allow rA == 15, to denote the usad8 (no accumulator) case. */
+
+     if (gate) {
+        IRExpr* rNe = isT ? getIRegT(rN) : getIRegA(rN);
+        IRExpr* rMe = isT ? getIRegT(rM) : getIRegA(rM);
+        IRExpr* rAe = rA == 15 ? mkU32(0)
+                               : (isT ? getIRegT(rA) : getIRegA(rA)); 
+        IRExpr* res = binop(Iop_Add32,
+                            binop(Iop_Sad8Ux4, rNe, rMe),
+                            rAe);
+        if (isT)
+           putIRegT( rD, res, condT );
+        else
+           putIRegA( rD, res, condT, Ijk_Boring );
+
+        if (rA == 15) {
+           DIP( "usad8%s r%u, r%u, r%u\n", 
+                nCC(conq), rD, rN, rM );
+        } else {
+           DIP( "usada8%s r%u, r%u, r%u, r%u\n", 
+                nCC(conq), rD, rN, rM, rA );
+        }
+        return True;
+     }
+     /* fall through */
+   }
+
+   /* ---------- Doesn't match anything. ---------- */
+   return False;
+
+#  undef INSNA
+#  undef INSNT0
+#  undef INSNT1
+}
+
+
+/*------------------------------------------------------------*/
+/*--- LDMxx/STMxx helper (both ARM and Thumb32)            ---*/
+/*------------------------------------------------------------*/
+
+/* Generate IR for LDMxx and STMxx.  This is complex.  Assumes it's
+   unconditional, so the caller must produce a jump-around before
+   calling this, if the insn is to be conditional.  Caller is
+   responsible for all validation of parameters.  For LDMxx, if PC is
+   amongst the values loaded, caller is also responsible for
+   generating the jump. */
+static void mk_ldm_stm ( Bool arm,     /* True: ARM, False: Thumb */
+                         UInt rN,      /* base reg */
+                         UInt bINC,    /* 1: inc,  0: dec */
+                         UInt bBEFORE, /* 1: inc/dec before, 0: after */
+                         UInt bW,      /* 1: writeback to Rn */
+                         UInt bL,      /* 1: load, 0: store */
+                         UInt regList )
+{
+   Int i, r, m, nRegs;
+
+   /* Get hold of the old Rn value.  We might need to write its value
+      to memory during a store, and if it's also the writeback
+      register then we need to get its value now.  We can't treat it
+      exactly like the other registers we're going to transfer,
+      because for xxMDA and xxMDB writeback forms, the generated IR
+      updates Rn in the guest state before any transfers take place.
+      We have to do this as per comments below, in order that if Rn is
+      the stack pointer then it always has a value that is below or equal
+      to any of the transfer addresses.  Ick. */
+   IRTemp oldRnT = newTemp(Ity_I32);
+   assign(oldRnT, arm ? getIRegA(rN) : getIRegT(rN));
+
+   IRTemp anchorT = newTemp(Ity_I32);
+   /* The old (Addison-Wesley) ARM ARM seems to say that LDMxx/STMxx
+      ignore the bottom two bits of the address.  However, Cortex-A8
+      doesn't seem to care.  Hence: */
+   /* No .. don't force alignment .. */
+   /* assign(anchorT, binop(Iop_And32, mkexpr(oldRnT), mkU32(~3U))); */
+   /* Instead, use the potentially misaligned address directly. */
+   assign(anchorT, mkexpr(oldRnT));
+
+   IROp opADDorSUB = bINC ? Iop_Add32 : Iop_Sub32;
+   // bINC == 1:  xxMIA, xxMIB
+   // bINC == 0:  xxMDA, xxMDB
+
+   // For xxMDA and xxMDB, update Rn first if necessary.  We have
+   // to do this first so that, for the common idiom of the transfers
+   // faulting because we're pushing stuff onto a stack and the stack
+   // is growing down onto allocate-on-fault pages (as Valgrind simulates),
+   // the SP is up-to-date, "covering" (pointing below) the
+   // transfer area.  For the same reason, if we are doing xxMIA or xxMIB,
+   // do the transfer first, and then update rN afterwards.
+   nRegs = 0;
+   for (i = 0; i < 16; i++) {
+     if ((regList & (1 << i)) != 0)
+         nRegs++;
+   }
+   if (bW == 1 && !bINC) {
+      IRExpr* e = binop(opADDorSUB, mkexpr(oldRnT), mkU32(4*nRegs));
+      if (arm)
+         putIRegA( rN, e, IRTemp_INVALID, Ijk_Boring );
+      else
+         putIRegT( rN, e, IRTemp_INVALID );
+   }
+
+   // Make up a list of the registers to transfer, and their offsets
+   // in memory relative to the anchor.  If the base reg (Rn) is part
+   // of the transfer, then do it last for a load and first for a store.
+   UInt xReg[16], xOff[16];
+   Int  nX = 0;
+   m = 0;
+   for (i = 0; i < 16; i++) {
+      r = bINC ? i : (15-i);
+      if (0 == (regList & (1<<r)))
+         continue;
+      if (bBEFORE)
+         m++;
+      /* paranoia: check we aren't transferring the writeback
+         register during a load. Should be assured by decode-point
+         check above. */
+      if (bW == 1 && bL == 1)
+         vassert(r != rN);
+
+      xOff[nX] = 4 * m;
+      xReg[nX] = r;
+      nX++;
+
+      if (!bBEFORE)
+         m++;
+   }
+   vassert(m == nRegs);
+   vassert(nX == nRegs);
+   vassert(nX <= 16);
+
+   if (bW == 0 && (regList & (1<<rN)) != 0) {
        /* Non-writeback, and basereg is to be transferred.  Do its
           transfer last for a load and first for a store.  Requires
           reordering xOff/xReg. */
@@ -8433,7 +10758,7 @@ static Bool decode_CP10_CP11_instruction (
        UInt dM = INSN(3,0) | (INSN(5,5) << 4);
        UInt rD = INSN(15,12); /* lo32 */
        UInt rN = INSN(19,16); /* hi32 */
-      if (rD == 15 || rN == 15) {
+      if (rD == 15 || rN == 15 || (isT && (rD == 13 || rN == 13))) {
           /* fall through */
        } else {
           putDReg(dM,
@@ -8445,41 +10770,87 @@ static Bool decode_CP10_CP11_instruction (
           DIP("vmov%s d%u, r%u, r%u\n", nCC(conq), dM, rD, rN);
           goto decode_success_vfp;
        }
-      /* fall through */
+      /* fall through */
+   }
+
+   // VMOV rD, rN, dM
+   if (0x0C500B10 == (insn28 & 0x0FF00FD0)) {
+      UInt dM = INSN(3,0) | (INSN(5,5) << 4);
+      UInt rD = INSN(15,12); /* lo32 */
+      UInt rN = INSN(19,16); /* hi32 */
+      if (rD == 15 || rN == 15 || (isT && (rD == 13 || rN == 13))
+          || rD == rN) {
+         /* fall through */
+      } else {
+         IRTemp i64 = newTemp(Ity_I64);
+         assign(i64, unop(Iop_ReinterpF64asI64, getDReg(dM)));
+         IRExpr* hi32 = unop(Iop_64HIto32, mkexpr(i64));
+         IRExpr* lo32 = unop(Iop_64to32,   mkexpr(i64));
+         if (isT) {
+            putIRegT(rN, hi32, condT);
+            putIRegT(rD, lo32, condT);
+         } else {
+            putIRegA(rN, hi32, condT, Ijk_Boring);
+            putIRegA(rD, lo32, condT, Ijk_Boring);
+         }
+         DIP("vmov%s r%u, r%u, d%u\n", nCC(conq), rD, rN, dM);
+         goto decode_success_vfp;
+      }
+      /* fall through */
+   }
+
+   // VMOV sD, sD+1, rN, rM
+   if (0x0C400A10 == (insn28 & 0x0FF00FD0)) {
+      UInt sD = (INSN(3,0) << 1) | INSN(5,5);
+      UInt rN = INSN(15,12);
+      UInt rM = INSN(19,16);
+      if (rM == 15 || rN == 15 || (isT && (rM == 13 || rN == 13))
+          || sD == 31) {
+         /* fall through */
+      } else {
+         putFReg(sD,
+                 unop(Iop_ReinterpI32asF32, isT ? getIRegT(rN) : getIRegA(rN)),
+                 condT);
+         putFReg(sD+1,
+                 unop(Iop_ReinterpI32asF32, isT ? getIRegT(rM) : getIRegA(rM)),
+                 condT);
+         DIP("vmov%s, s%u, s%u, r%u, r%u\n",
+              nCC(conq), sD, sD + 1, rN, rM);
+         goto decode_success_vfp;
+      }
     }
  
-   // VMOV rD, rN, dM
-   if (0x0C500B10 == (insn28 & 0x0FF00FD0)) {
-      UInt dM = INSN(3,0) | (INSN(5,5) << 4);
-      UInt rD = INSN(15,12); /* lo32 */
-      UInt rN = INSN(19,16); /* hi32 */
-      if (rD == 15 || rN == 15 || rD == rN) {
+   // VMOV rN, rM, sD, sD+1
+   if (0x0C500A10 == (insn28 & 0x0FF00FD0)) {
+      UInt sD = (INSN(3,0) << 1) | INSN(5,5);
+      UInt rN = INSN(15,12);
+      UInt rM = INSN(19,16);
+      if (rM == 15 || rN == 15 || (isT && (rM == 13 || rN == 13))
+          || sD == 31 || rN == rM) {
           /* fall through */
        } else {
-         IRTemp i64 = newTemp(Ity_I64);
-         assign(i64, unop(Iop_ReinterpF64asI64, getDReg(dM)));
-         IRExpr* hi32 = unop(Iop_64HIto32, mkexpr(i64));
-         IRExpr* lo32 = unop(Iop_64to32,   mkexpr(i64));
+         IRExpr* res0 = unop(Iop_ReinterpF32asI32, getFReg(sD));
+         IRExpr* res1 = unop(Iop_ReinterpF32asI32, getFReg(sD+1));
           if (isT) {
-            putIRegT(rN, hi32, condT);
-            putIRegT(rD, lo32, condT);
+            putIRegT(rN, res0, condT);
+            putIRegT(rM, res1, condT);
           } else {
-            putIRegA(rN, hi32, condT, Ijk_Boring);
-            putIRegA(rD, lo32, condT, Ijk_Boring);
+            putIRegA(rN, res0, condT, Ijk_Boring);
+            putIRegA(rM, res1, condT, Ijk_Boring);
           }
-         DIP("vmov%s r%u, r%u, d%u\n", nCC(conq), rD, rN, dM);
+         DIP("vmov%s, r%u, r%u, s%u, s%u\n",
+             nCC(conq), rN, rM, sD, sD + 1);
           goto decode_success_vfp;
        }
-      /* fall through */
     }
  
-   // VMOV rD[x], rT
+   // VMOV rD[x], rT  (ARM core register to scalar)
     if (0x0E000B10 == (insn28 & 0x0F900F1F)) {
        UInt rD  = (INSN(7,7) << 4) | INSN(19,16);
        UInt rT  = INSN(15,12);
        UInt opc = (INSN(22,21) << 2) | INSN(6,5);
        UInt index;
-      if (rT == 15) {
+      if (rT == 15 || (isT && rT == 13)) {
           /* fall through */
        } else {
           if ((opc & BITS4(1,0,0,0)) == BITS4(1,0,0,0)) {
@@ -8519,6 +10890,7 @@ static Bool decode_CP10_CP11_instruction (
        }
     }
  
+   // VMOV (scalar to ARM core register)
     // VMOV rT, rD[x]
     if (0x0E100B10 == (insn28 & 0x0F100F1F)) {
        UInt rN  = (INSN(7,7) << 4) | INSN(19,16);
@@ -8526,7 +10898,7 @@ static Bool decode_CP10_CP11_instruction (
        UInt U   = INSN(23,23);
        UInt opc = (INSN(22,21) << 2) | INSN(6,5);
        UInt index;
-      if (rT == 15) {
+      if (rT == 15 || (isT && rT == 13)) {
           /* fall through */
        } else {
           if ((opc & BITS4(1,0,0,0)) == BITS4(1,0,0,0)) {
@@ -8613,7 +10985,7 @@ static Bool decode_CP10_CP11_instruction (
        UInt rT   = INSN(15,12);
        UInt Q    = INSN(21,21);
        UInt size = (INSN(22,22) << 1) | INSN(5,5);
-      if (rT == 15 || size == 3i || (Q && (rD & 1))) {
+      if (rT == 15 || (isT && rT == 13) || size == 3 || (Q && (rD & 1))) {
           /* fall through */
        } else {
           IRExpr* e = isT ? getIRegT(rT) : getIRegA(rT);
@@ -8659,9 +11031,9 @@ static Bool decode_CP10_CP11_instruction (
  
     /* --------------------- f{ld,st}d --------------------- */
     // FLDD, FSTD
-   if (BITS8(1,1,0,1,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,0,1,1,0))
+   if (BITS8(1,1,0,1,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,0,0,1,0))
         && BITS4(1,0,1,1) == INSN(11,8)) {
-      UInt dD     = INSN(15,12);
+      UInt dD     = INSN(15,12) | (INSN(22,22) << 4);
        UInt rN     = INSN(19,16);
        UInt offset = (insn28 & 0xFF) << 2;
        UInt bU     = (insn28 >> 23) & 1; /* 1: +offset  0: -offset */
@@ -8691,12 +11063,12 @@ static Bool decode_CP10_CP11_instruction (
     }
  
     /* --------------------- dp insns (D) --------------------- */
-   if (BITS8(1,1,1,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,0,1,0,0))
+   if (BITS8(1,1,1,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,0,0,0,0))
         && BITS4(1,0,1,1) == INSN(11,8)
-       && BITS4(0,0,0,0) == (INSN(7,4) & BITS4(1,0,1,1))) {
-      UInt    dM  = INSN(3,0);   /* argR */
-      UInt    dD  = INSN(15,12); /* dst/acc */
-      UInt    dN  = INSN(19,16); /* argL */
+       && BITS4(0,0,0,0) == (INSN(7,4) & BITS4(0,0,0,1))) {
+      UInt    dM  = INSN(3,0)   | (INSN(5,5) << 4);       /* argR */
+      UInt    dD  = INSN(15,12) | (INSN(22,22) << 4);   /* dst/acc */
+      UInt    dN  = INSN(19,16) | (INSN(7,7) << 4);     /* argL */
        UInt    bP  = (insn28 >> 23) & 1;
        UInt    bQ  = (insn28 >> 21) & 1;
        UInt    bR  = (insn28 >> 20) & 1;
@@ -8712,11 +11084,12 @@ static Bool decode_CP10_CP11_instruction (
                          condT);
              DIP("fmacd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
              goto decode_success_vfp;
-         case BITS4(0,0,0,1): /* NMAC: d - n * m */
-            putDReg(dD, triop(Iop_SubF64, rm,
+         case BITS4(0,0,0,1): /* NMAC: d + -(n * m) */
+            putDReg(dD, triop(Iop_AddF64, rm,
                                getDReg(dD),
-                              triop(Iop_MulF64, rm, getDReg(dN),
-                                                    getDReg(dM))),
+                              unop(Iop_NegF64,
+                                   triop(Iop_MulF64, rm, getDReg(dN),
+                                                         getDReg(dM)))),
                          condT);
              DIP("fnmacd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
              goto decode_success_vfp;
@@ -8728,11 +11101,12 @@ static Bool decode_CP10_CP11_instruction (
                          condT);
              DIP("fmscd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
              goto decode_success_vfp;
-         case BITS4(0,0,1,1): /* NMSC: - d - n * m */
-            putDReg(dD, triop(Iop_SubF64, rm,
+         case BITS4(0,0,1,1): /* NMSC: - d + -(n * m) */
+            putDReg(dD, triop(Iop_AddF64, rm,
                                unop(Iop_NegF64, getDReg(dD)),
-                              triop(Iop_MulF64, rm, getDReg(dN),
-                                                    getDReg(dM))),
+                              unop(Iop_NegF64,
+                                   triop(Iop_MulF64, rm, getDReg(dN),
+                                                         getDReg(dM)))),
                          condT);
              DIP("fnmscd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
              goto decode_success_vfp;
@@ -8771,10 +11145,10 @@ static Bool decode_CP10_CP11_instruction (
     /* --------------------- compares (D) --------------------- */
     /*          31   27   23   19   15 11   7    3
                   28   24   20   16 12    8    4    0 
-      FCMPD    cond 1110 1011 0100 Dd 1011 0100 Dm
-      FCMPED   cond 1110 1011 0100 Dd 1011 1100 Dm
-      FCMPZD   cond 1110 1011 0101 Dd 1011 0100 0000
-      FCMPZED  cond 1110 1011 0101 Dd 1011 1100 0000
+      FCMPD    cond 1110 1D11 0100 Dd 1011 0100 Dm
+      FCMPED   cond 1110 1D11 0100 Dd 1011 1100 Dm
+      FCMPZD   cond 1110 1D11 0101 Dd 1011 0100 0000
+      FCMPZED  cond 1110 1D11 0101 Dd 1011 1100 0000
                                   Z         N
  
        Z=0 Compare Dd vs Dm     and set FPSCR 31:28 accordingly
@@ -8784,14 +11158,14 @@ static Bool decode_CP10_CP11_instruction (
        N=0 generates Invalid Operation exn if either arg is a signalling NaN
        (Not that we pay any attention to N here)
     */
-   if (BITS8(1,1,1,0,1,0,1,1) == INSN(27,20)
+   if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
         && BITS4(0,1,0,0) == (INSN(19,16) & BITS4(1,1,1,0))
         && BITS4(1,0,1,1) == INSN(11,8)
-       && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,1,1))) {
+       && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
        UInt bZ = (insn28 >> 16) & 1;
        UInt bN = (insn28 >> 7) & 1;
-      UInt dD = INSN(15,12);
-      UInt dM = INSN(3,0);
+      UInt dD = INSN(15,12) | (INSN(22,22) << 4);
+      UInt dM = INSN(3,0) | (INSN(5,5) << 4);
        if (bZ && INSN(3,0) != 0) {
           /* does not decode; fall through */
        } else {
@@ -8841,12 +11215,12 @@ static Bool decode_CP10_CP11_instruction (
     }  
  
     /* --------------------- unary (D) --------------------- */
-   if (BITS8(1,1,1,0,1,0,1,1) == INSN(27,20)
+   if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
         && BITS4(0,0,0,0) == (INSN(19,16) & BITS4(1,1,1,0))
         && BITS4(1,0,1,1) == INSN(11,8)
-       && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,1,1))) {
-      UInt dD  = INSN(15,12);
-      UInt dM  = INSN(3,0);
+       && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
+      UInt dD  = INSN(15,12) | (INSN(22,22) << 4);
+      UInt dM  = INSN(3,0) | (INSN(5,5) << 4);
        UInt b16 = (insn28 >> 16) & 1;
        UInt b7  = (insn28 >> 7) & 1;
        /**/ if (b16 == 0 && b7 == 0) {
@@ -8883,13 +11257,13 @@ static Bool decode_CP10_CP11_instruction (
     /* ----------------- I <-> D conversions ----------------- */
  
     // F{S,U}ITOD dD, fM
-   if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,1,1,1))
+   if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
         && BITS4(1,0,0,0) == (INSN(19,16) & BITS4(1,1,1,1))
         && BITS4(1,0,1,1) == INSN(11,8)
         && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
        UInt bM    = (insn28 >> 5) & 1;
        UInt fM    = (INSN(3,0) << 1) | bM;
-      UInt dD    = INSN(15,12);
+      UInt dD    = INSN(15,12) | (INSN(22,22) << 4);
        UInt syned = (insn28 >> 7) & 1;
        if (syned) {
           // FSITOD
@@ -8911,10 +11285,10 @@ static Bool decode_CP10_CP11_instruction (
     if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
         && BITS4(1,1,0,0) == (INSN(19,16) & BITS4(1,1,1,0))
         && BITS4(1,0,1,1) == INSN(11,8)
-       && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,1,1))) {
+       && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
        UInt   bD    = (insn28 >> 22) & 1;
        UInt   fD    = (INSN(15,12) << 1) | bD;
-      UInt   dM    = INSN(3,0);
+      UInt   dM    = INSN(3,0) | (INSN(5,5) << 4);
        UInt   bZ    = (insn28 >> 7) & 1;
        UInt   syned = (insn28 >> 16) & 1;
        IRTemp rmode = newTemp(Ity_I32);
@@ -9152,7 +11526,7 @@ static Bool decode_CP10_CP11_instruction (
  
     /* --------------------- dp insns (F) --------------------- */
     if (BITS8(1,1,1,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,0,0,0,0))
-       && BITS4(1,0,1,0) == INSN(11,8)
+       && BITS4(1,0,1,0) == (INSN(11,8) & BITS4(1,1,1,0))
         && BITS4(0,0,0,0) == (INSN(7,4) & BITS4(0,0,0,1))) {
        UInt    bM  = (insn28 >> 5) & 1;
        UInt    bD  = (insn28 >> 22) & 1;
@@ -9174,10 +11548,12 @@ static Bool decode_CP10_CP11_instruction (
                          condT);
              DIP("fmacs%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
              goto decode_success_vfp;
-         case BITS4(0,0,0,1): /* NMAC: d - n * m */
-            putFReg(fD, triop(Iop_SubF32, rm,
+         case BITS4(0,0,0,1): /* NMAC: d + -(n * m) */
+            putFReg(fD, triop(Iop_AddF32, rm,
                                getFReg(fD),
-                              triop(Iop_MulF32, rm, getFReg(fN), getFReg(fM))),
+                              unop(Iop_NegF32,
+                                   triop(Iop_MulF32, rm, getFReg(fN),
+                                                         getFReg(fM)))),
                          condT);
              DIP("fnmacs%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
              goto decode_success_vfp;
@@ -9188,8 +11564,16 @@ static Bool decode_CP10_CP11_instruction (
                          condT);
              DIP("fmscs%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
              goto decode_success_vfp;
-         case BITS4(0,0,1,1): /* NMSC: - d - n * m */
-            break; //ATC
+         case BITS4(0,0,1,1): /* NMSC: - d + -(n * m) */
+            putFReg(fD, triop(Iop_AddF32, rm,
+                              unop(Iop_NegF32, getFReg(fD)),
+                              unop(Iop_NegF32,
+                                   triop(Iop_MulF32, rm,
+                                                     getFReg(fN),
+                                                    getFReg(fM)))),
+                        condT);
+            DIP("fnmscs%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
+            goto decode_success_vfp;
           case BITS4(0,1,0,0): /* MUL: n * m */
              putFReg(fD, triop(Iop_MulF32, rm, getFReg(fN), getFReg(fM)),
                          condT);
@@ -9350,8 +11734,8 @@ static Bool decode_CP10_CP11_instruction (
        the case here.  Hence this case possibly requires rounding, and
        so it drags in the current rounding mode. */
     if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
-       && BITS4(1,0,0,0) == (INSN(19,16) & BITS4(1,1,1,1))
-       && BITS4(1,0,1,0) == INSN(11,8)
+       && BITS4(1,0,0,0) == INSN(19,16)
+       && BITS4(1,0,1,0) == (INSN(11,8) & BITS4(1,1,1,0))
         && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
        UInt bM    = (insn28 >> 5) & 1;
        UInt bD    = (insn28 >> 22) & 1;
@@ -9418,11 +11802,11 @@ static Bool decode_CP10_CP11_instruction (
     /* ----------------- S <-> D conversions ----------------- */
  
     // FCVTDS
-   if (BITS8(1,1,1,0,1,0,1,1) == INSN(27,20)
+   if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
         && BITS4(0,1,1,1) == INSN(19,16)
         && BITS4(1,0,1,0) == INSN(11,8)
         && BITS4(1,1,0,0) == (INSN(7,4) & BITS4(1,1,0,1))) {
-      UInt dD = INSN(15,12);
+      UInt dD = INSN(15,12) | (INSN(22,22) << 4);
        UInt bM = (insn28 >> 5) & 1;
        UInt fM = (INSN(3,0) << 1) | bM;
        putDReg(dD, unop(Iop_F32toF64, getFReg(fM)), condT);
@@ -9434,10 +11818,10 @@ static Bool decode_CP10_CP11_instruction (
     if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
         && BITS4(0,1,1,1) == INSN(19,16)
         && BITS4(1,0,1,1) == INSN(11,8)
-       && BITS4(1,1,0,0) == INSN(7,4)) {
+       && BITS4(1,1,0,0) == (INSN(7,4) & BITS4(1,1,0,1))) {
        UInt   bD    = (insn28 >> 22) & 1;
        UInt   fD    = (INSN(15,12) << 1) | bD;
-      UInt   dM    = INSN(3,0);
+      UInt   dM    = INSN(3,0) | (INSN(5,5) << 4);
        IRTemp rmode = newTemp(Ity_I32);
        assign(rmode, mkexpr(mk_get_IR_rounding_mode()));
        putFReg(fD, binop(Iop_F64toF32, mkexpr(rmode), getDReg(dM)),
@@ -9472,7 +11856,9 @@ static Bool decode_CP10_CP11_instruction (
     Note that all NEON instructions (in ARM mode) are handled through
     here, since they are all in NV space.
  */
-static Bool decode_NV_instruction ( /*MOD*/DisResult* dres, UInt insn )
+static Bool decode_NV_instruction ( /*MOD*/DisResult* dres,
+                                    VexArchInfo* archinfo,
+                                    UInt insn )
  {
  #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
  #  define INSN_COND          SLICE_UInt(insn, 31, 28)
@@ -9515,6 +11901,16 @@ static Bool decode_NV_instruction ( /*MOD*/DisResult* dres, UInt insn )
        /* fall through */
     }
  
+   /* ------------------------ pli ------------------------ */
+   if (BITS8(0,1,0,0, 0, 1,0,1) == (INSN(27,20) & BITS8(1,1,1,1,0,1,1,1))
+       && BITS4(1,1,1,1) == INSN(15,12)) {
+      UInt rN    = INSN(19,16);
+      UInt imm12 = INSN(11,0);
+      UInt bU    = INSN(23,23);
+      DIP("pli [r%u, #%c%u]\n", rN, bU ? '+' : '-', imm12);
+      return True;
+   }
+
     /* --------------------- Interworking branches --------------------- */
  
     // BLX (1), viz, unconditional branch and link to R15+simm24
@@ -9546,11 +11942,25 @@ static Bool decode_NV_instruction ( /*MOD*/DisResult* dres, UInt insn )
           stmt( IRStmt_MBE(Imbe_Fence) );
           DIP("ISB\n");
           return True;
-      case 0xF57FF04F: /* DSB */
+      case 0xF57FF04F: /* DSB sy */
+      case 0xF57FF04E: /* DSB st */
+      case 0xF57FF04B: /* DSB ish */
+      case 0xF57FF04A: /* DSB ishst */
+      case 0xF57FF047: /* DSB nsh */
+      case 0xF57FF046: /* DSB nshst */
+      case 0xF57FF043: /* DSB osh */
+      case 0xF57FF042: /* DSB oshst */
           stmt( IRStmt_MBE(Imbe_Fence) );
           DIP("DSB\n");
           return True;
-      case 0xF57FF05F: /* DMB */
+      case 0xF57FF05F: /* DMB sy */
+      case 0xF57FF05E: /* DMB st */
+      case 0xF57FF05B: /* DMB ish */
+      case 0xF57FF05A: /* DMB ishst */
+      case 0xF57FF057: /* DMB nsh */
+      case 0xF57FF056: /* DMB nshst */
+      case 0xF57FF053: /* DMB osh */
+      case 0xF57FF052: /* DMB oshst */
           stmt( IRStmt_MBE(Imbe_Fence) );
           DIP("DMB\n");
           return True;
@@ -9559,12 +11969,14 @@ static Bool decode_NV_instruction ( /*MOD*/DisResult* dres, UInt insn )
     }
  
     /* ------------------- NEON ------------------- */
-   Bool ok = decode_NEON_instruction(
-                dres, insn, IRTemp_INVALID/*unconditional*/, 
-                False/*!isT*/
-             );
-   if (ok)
-      return True;
+   if (archinfo->hwcaps & VEX_HWCAPS_ARM_NEON) {
+      Bool ok_neon = decode_NEON_instruction(
+                        dres, insn, IRTemp_INVALID/*unconditional*/, 
+                        False/*!isT*/
+                     );
+      if (ok_neon)
+         return True;
+   }
  
     // unrecognised
     return False;
@@ -9710,7 +12122,7 @@ DisResult disInstr_ARM_WRK (
        case ARMCondNV: {
           // Illegal instruction prior to v5 (see ARM ARM A3-5), but
           // some cases are acceptable
-         Bool ok = decode_NV_instruction(&dres, insn);
+         Bool ok = decode_NV_instruction(&dres, archinfo, insn);
           if (ok)
              goto decode_success;
           else
@@ -10833,77 +13245,50 @@ DisResult disInstr_ARM_WRK (
  
     /* --------------------- Msr etc --------------------- */
  
-   // MSR cpsr_f, #imm8  (immediate form, flags only)
-   if (BITS8(0,0,1,1,0,0,1,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
-       && INSN(15,12) == BITS4(1,1,1,1)) {
-      UInt bitR = (insn >> 22) & 1;
-      if (bitR == 0 && INSN(19,16) == BITS4(1,0,0,0)) {
+   // MSR apsr, #imm  (insn bit 19 -> write_nzcvq, bit 18 -> write_ge)
+   if (INSN(27,20) == BITS8(0,0,1,1,0,0,1,0)
+       && INSN(17,12) == BITS6(0,0,1,1,1,1)) {
+      UInt write_ge    = INSN(18,18);
+      UInt write_nzcvq = INSN(19,19);
+      if (write_nzcvq || write_ge) {
           UInt   imm = (INSN(11,0) >> 0) & 0xFF;
           UInt   rot = 2 * ((INSN(11,0) >> 8) & 0xF);
           IRTemp immT = newTemp(Ity_I32);
           vassert(rot <= 30);
           imm = ROR32(imm, rot);
-         imm &= 0xFF000000;
-         imm &= (ARMG_CC_MASK_N | ARMG_CC_MASK_Z 
-                 | ARMG_CC_MASK_V | ARMG_CC_MASK_C | ARMG_CC_MASK_Q);
-         assign( immT, mkU32(imm & 0xF0000000) );
-         setFlags_D1(ARMG_CC_OP_COPY, immT, condT);
-         // Set QFLAG32 to a zero or nonzero value, depending on #imm8.
-         IRTemp qnewT = newTemp(Ity_I32);
-         assign(qnewT, mkU32( imm & ARMG_CC_MASK_Q ));
-         put_QFLAG32(qnewT, condT);
-         DIP("msr%s cpsr_f, #0x%08x\n", nCC(INSN_COND), imm);
+         assign(immT, mkU32(imm));
+         desynthesise_APSR( write_nzcvq, write_ge, immT, condT );
+         DIP("msr%s cpsr_%s%s, #0x%08x\n", nCC(INSN_COND),
+             write_nzcvq ? "f" : "", write_ge ? "g" : "", imm);
           goto decode_success;
        }
        /* fall through */
     }
  
-   // MSR cpsr_f, rM  (flags only)
-   if (BITS8(0,0,0,1,0,0,1,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
-       && INSN(15,12) == BITS4(1,1,1,1)) {
-      UInt bitR = (insn >> 22) & 1;
-      if (bitR == 0 && INSN(19,16) == BITS4(1,0,0,0)
-          && INSN(11,4) == BITS8(0,0,0,0,0,0,0,0)
-          && INSN(3,0) != 15) {
-         UInt   rM  = INSN(3,0);
-         IRTemp rMt = newTemp(Ity_I32);
-         assign(rMt, getIRegA(rM));
-         IRTemp immT = newTemp(Ity_I32);
-         assign(immT, binop(Iop_And32, mkexpr(rMt), mkU32(0xF0000000)) );
-         setFlags_D1(ARMG_CC_OP_COPY, immT, condT);
-         IRTemp qnewT = newTemp(Ity_I32);
-         assign(qnewT, binop(Iop_And32, mkexpr(rMt), mkU32(ARMG_CC_MASK_Q)));
-         put_QFLAG32(qnewT, condT);
-         DIP("msr%s cpsr_f, r%u\n", nCC(INSN_COND), rM);
+   // MSR apsr, reg
+   if (INSN(27,20) == BITS8(0,0,0,1,0,0,1,0) 
+       && INSN(17,12) == BITS6(0,0,1,1,1,1)
+       && INSN(11,4) == BITS8(0,0,0,0,0,0,0,0)) {
+      UInt rN          = INSN(3,0);
+      UInt write_ge    = INSN(18,18);
+      UInt write_nzcvq = INSN(19,19);
+      if (rN != 15 && (write_nzcvq || write_ge)) {
+         IRTemp rNt = newTemp(Ity_I32);
+         assign(rNt, getIRegA(rN));
+         desynthesise_APSR( write_nzcvq, write_ge, rNt, condT );
+         DIP("msr%s cpsr_%s%s, r%u\n", nCC(INSN_COND),
+             write_nzcvq ? "f" : "", write_ge ? "g" : "", rN);
           goto decode_success;
        }
        /* fall through */
     }
  
     // MRS rD, cpsr
-   if (BITS8(0,0,0,1,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
-       && INSN(19,16) == BITS4(1,1,1,1)
-       && INSN(11,0) == 0) {
-      UInt bitR = (insn >> 22) & 1;
+   if ((insn & 0x0FFF0FFF) == 0x010F0000) {
        UInt rD   = INSN(15,12);
-      if (bitR == 0 && rD != 15) {
-         IRTemp res1 = newTemp(Ity_I32);
-         // Get NZCV
-         assign( res1, mk_armg_calculate_flags_nzcv() );
-         /// OR in the Q value
-         IRTemp res2 = newTemp(Ity_I32);
-         assign(
-            res2,
-            binop(Iop_Or32,
-                  mkexpr(res1),
-                  binop(Iop_Shl32,
-                        unop(Iop_1Uto32,
-                             binop(Iop_CmpNE32,
-                                   mkexpr(get_QFLAG32()),
-                                   mkU32(0))),
-                        mkU8(ARMG_CC_SHIFT_Q)))
-         );
-         putIRegA( rD, mkexpr(res2), condT, Ijk_Boring );
+      if (rD != 15) {
+         IRTemp apsr = synthesise_APSR();
+         putIRegA( rD, mkexpr(apsr), condT, Ijk_Boring );
           DIP("mrs%s r%u, cpsr\n", nCC(INSN_COND), rD);
           goto decode_success;
        }
@@ -10991,7 +13376,7 @@ DisResult disInstr_ARM_WRK (
     if (0x01900F9F == (insn & 0x0FF00FFF)) {
        UInt rT = INSN(15,12);
        UInt rN = INSN(19,16);
-      if (rT == 15 || rN == 15 || rT == 14 /* || (rT & 1)*/) {
+      if (rT == 15 || rN == 15) {
           /* undecodable; fall through */
        } else {
           IRTemp res;
@@ -11017,8 +13402,7 @@ DisResult disInstr_ARM_WRK (
        UInt rN = INSN(19,16);
        UInt rD = INSN(15,12);
        if (rT == 15 || rN == 15 || rD == 15
-          || rT == 14 /* || (rT & 1)*/
-          || rD == rT || rN == rT) {
+          || rD == rT || rD == rN) {
           /* undecodable; fall through */
        } else {
           IRTemp resSC1, resSC32;
@@ -11073,7 +13457,9 @@ DisResult disInstr_ARM_WRK (
        /* fall through */
     }
  
-   /* ------------------- {u,s}xt{b,h}{,16} ------------------- */
+   /* ----------- uxtb, sxtb, uxth, sxth, uxtb16, sxtb16 ----------- */
+   /* FIXME: this is an exact duplicate of the Thumb version.  They
+      should be commoned up. */
     if (BITS8(0,1,1,0,1, 0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,0,0))
         && BITS4(1,1,1,1) == INSN(19,16)
         && BITS4(0,1,1,1) == INSN(7,4)
@@ -11127,7 +13513,7 @@ DisResult disInstr_ARM_WRK (
                                     unop(Iop_32to8, mkexpr(hi32))),
                                mkU8(16))
                 ));
-               nm = "uxtb16";
+               nm = "sxtb16";
                 break;
              }
              default:
@@ -11221,45 +13607,6 @@ DisResult disInstr_ARM_WRK (
        /* fall through */
     }
  
-   /* ------------------- smul{b,t}{b,t} ------------- */
-   if (BITS8(0,0,0,1,0,1,1,0) == INSN(27,20)
-       && BITS4(0,0,0,0) == INSN(15,12)
-       && BITS4(1,0,0,0) == (INSN(7,4) & BITS4(1,0,0,1))) {
-      UInt rD  = INSN(19,16);
-      UInt rM  = INSN(11,8);
-      UInt rN  = INSN(3,0);
-      UInt bM = (insn >> 6) & 1;
-      UInt bN = (insn >> 5) & 1;
-      if (bN == 0 && bM == 1) goto decode_failure; //ATC
-      if (bN == 1 && bM == 0) goto decode_failure; //ATC
-      if (bN == 1 && bM == 1) goto decode_failure; //ATC
-      if (rD == 15 || rN == 15 || rM == 15) {
-         /* undecodable; fall through */
-      } else {
-         IRTemp srcL = newTemp(Ity_I32);
-         IRTemp srcR = newTemp(Ity_I32);
-         IRTemp res  = newTemp(Ity_I32);
-
-         /* Extract and sign extend the two 16-bit operands */
-         assign(srcL, binop(Iop_Sar32,
-                            binop(Iop_Shl32, getIRegA(rN),
-                                             mkU8(bN ? 0 : 16)),
-                            mkU8(16)));
-         assign(srcR, binop(Iop_Sar32,
-                            binop(Iop_Shl32, getIRegA(rM),
-                                             mkU8(bM ? 0 : 16)),
-                            mkU8(16)));
-
-         assign(res, binop(Iop_Mul32, mkexpr(srcL), mkexpr(srcR)));
-         putIRegA(rD, mkexpr(res), condT, Ijk_Boring);
-
-         DIP("smul%c%c%s r%u, r%u, r%u\n",
-             bN ? 't' : 'b', bM ? 't' : 'b', nCC(INSN_COND), rD, rN, rM);
-         goto decode_success;
-      }
-      /* fall through */
-   }
-
     /* --------------------- Load/store doubleword ------------- */
     // LDRD STRD
     /*                 31   27   23   19 15 11   7    3     # highest bit
@@ -11306,7 +13653,6 @@ DisResult disInstr_ARM_WRK (
     }
     else if (INSN(27,24) == BITS4(0,0,0,0) && INSN(22,20) == BITS3(0,0,0)) {
        summary = 3 | 32;
-      goto decode_failure; //ATC
     }
     else goto after_load_store_doubleword;
  
@@ -11499,6 +13845,64 @@ DisResult disInstr_ARM_WRK (
        /* fall through */
     }
  
+   /* ------------------- rev16, rev ------------------ */
+   if (INSN(27,16) == 0x6BF
+       && (INSN(11,4) == 0xFB/*rev16*/ || INSN(11,4) == 0xF3/*rev*/)) {
+      Bool isREV = INSN(11,4) == 0xF3;
+      UInt rM    = INSN(3,0);
+      UInt rD    = INSN(15,12);
+      if (rM != 15 && rD != 15) {
+         IRTemp rMt = newTemp(Ity_I32);
+         assign(rMt, getIRegA(rM));
+         IRTemp res = isREV ? gen_REV(rMt) : gen_REV16(rMt);
+         putIRegA(rD, mkexpr(res), condT, Ijk_Boring);
+         DIP("rev%s%s r%u, r%u\n", isREV ? "" : "16",
+             nCC(INSN_COND), rD, rM);
+         goto decode_success;
+      }
+   }
+
+   /* ------- rbit: rD = gen_BITREV(rM); r15 not accepted ------- */
+   if (INSN(27,16) == 0x6FF && INSN(11,4) == 0xF3) {
+      UInt rD = INSN(15,12);
+      UInt rM = INSN(3,0);
+      if (rD != 15 && rM != 15) {
+         IRTemp arg = newTemp(Ity_I32);
+         assign(arg, getIRegA(rM));
+         IRTemp res = gen_BITREV(arg);
+         putIRegA(rD, mkexpr(res), condT, Ijk_Boring);
+         DIP("rbit%s r%u, r%u\n", nCC(INSN_COND), rD, rM);
+         goto decode_success;
+      }
+   }
+
+   /* ------------------- smmul ------------------ */
+   if (INSN(27,20) == BITS8(0,1,1,1,0,1,0,1)
+       && INSN(15,12) == BITS4(1,1,1,1)
+       && (INSN(7,4) & BITS4(1,1,0,1)) == BITS4(0,0,0,1)) {
+      UInt bitR = INSN(5,5);
+      UInt rD = INSN(19,16);
+      UInt rM = INSN(11,8);
+      UInt rN = INSN(3,0);
+      if (rD != 15 && rM != 15 && rN != 15) {
+         IRExpr* res
+         = unop(Iop_64HIto32,
+                binop(Iop_Add64,
+                      binop(Iop_MullS32, getIRegA(rN), getIRegA(rM)),
+                      mkU64(bitR ? 0x80000000ULL : 0ULL)));
+         putIRegA(rD, res, condT, Ijk_Boring);
+         DIP("smmul%s%s r%u, r%u, r%u\n",
+             nCC(INSN_COND), bitR ? "r" : "", rD, rN, rM);
+         goto decode_success;
+      }
+   }
+
+   /* ------------------- NOP ------------------ */
+   if (0x0320F000 == (insn & 0x0FFFFFFF)) {
+      DIP("nop%s\n", nCC(INSN_COND));
+      goto decode_success;
+   }
+
     /* ----------------------------------------------------------- */
     /* -- ARMv7 instructions                                    -- */
     /* ----------------------------------------------------------- */
@@ -11573,6 +13977,18 @@ DisResult disInstr_ARM_WRK (
     /* These are all in NV space, and so are taken care of (far) above,
        by a call from this function to decode_NV_instruction(). */
  
+   /* ----------------------------------------------------------- */
+   /* -- v6 media instructions (in ARM mode)                   -- */
+   /* ----------------------------------------------------------- */
+
+   { Bool ok_v6m = decode_V6MEDIA_instruction(
+                       &dres, INSN(27,0), condT, INSN_COND,
+                       False/*!isT*/
+                   );
+     if (ok_v6m)
+        goto decode_success;
+   }
+
     /* ----------------------------------------------------------- */
     /* -- Undecodable                                           -- */
     /* ----------------------------------------------------------- */
@@ -11625,7 +14041,7 @@ DisResult disInstr_ARM_WRK (
           assert here. */
        vassert(dres.whatNext == Dis_Continue);
        vassert(irsb->next == NULL);
-      vassert(irsb->jumpkind = Ijk_Boring);
+      vassert(irsb->jumpkind == Ijk_Boring);
        /* If r15 is unconditionally written, terminate the block by
           jumping to it.  If it's conditionally written, still
           terminate the block (a shame, but we can't do side exits to
@@ -11695,6 +14111,10 @@ DisResult disInstr_THUMB_WRK (
     //UInt      hwcaps = archinfo->hwcaps;
     HChar     dis_buf[128];  // big enough to hold LDMIA etc text
  
+   /* Summary result of the ITxxx backwards analysis: False == safe
+      but suboptimal. */
+   Bool guaranteedUnconditional = False;
+
     /* What insn variants are we supporting today? */
     //allow_VFP  = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP));
     // etc etc
@@ -11795,7 +14215,7 @@ DisResult disInstr_THUMB_WRK (
        the instruction word, first for 16-bit insns, then for 32-bit
        insns. */
  
-   /* --- BEGIN optimisation --- */
+   /* --- BEGIN ITxxx optimisation analysis --- */
     /* This is a crucial optimisation for the ITState boilerplate that
        follows.  Examine the 9 halfwords preceding this instruction,
        and if we are absolutely sure that none of them constitute an
@@ -11834,7 +14254,7 @@ DisResult disInstr_THUMB_WRK (
     {
        /* Summary result of this analysis: False == safe but
           suboptimal. */
-      Bool forceZ = False;
+      vassert(guaranteedUnconditional == False);
  
        UInt pc = guest_R15_curr_instr_notENC;
        vassert(0 == (pc & 1));
@@ -11843,7 +14263,7 @@ DisResult disInstr_THUMB_WRK (
        if (pageoff >= 18) {
           /* It's safe to poke about in the 9 halfwords preceding this
              insn.  So, have a look at them. */
-         forceZ = True; /* assume no 'it' insn found, till we do */
+         guaranteedUnconditional = True; /* assume no 'it' insn found, till we do */
  
           UShort* hwp = (UShort*)(HWord)pc;
           Int i;
@@ -11858,19 +14278,13 @@ DisResult disInstr_THUMB_WRK (
                 where x can be anything, but y must be nonzero. */
              if ((hwp[i] & 0xFF00) == 0xBF00 && (hwp[i] & 0xF) != 0) {
                 /* might be an 'it' insn.  Play safe. */
-               forceZ = False;
+               guaranteedUnconditional = False;
                 break;
              }
           }
        }
-      /* So, did we get lucky? */
-      if (forceZ) {
-         IRTemp t = newTemp(Ity_I32);
-         assign(t, mkU32(0));
-         put_ITSTATE(t);
-      }
     }
-   /* --- END optimisation --- */
+   /* --- END ITxxx optimisation analysis --- */
  
     /* Generate the guarding condition for this insn, by examining
        ITSTATE.  Assign it to condT.  Also, generate new
@@ -11881,74 +14295,177 @@ DisResult disInstr_THUMB_WRK (
        decode_success handle this, but in cases where the insn contains
        a side exit, we have to update them before the exit. */
  
-   IRTemp old_itstate = get_ITSTATE();
-
-   IRTemp new_itstate = newTemp(Ity_I32);
-   assign(new_itstate, binop(Iop_Shr32, mkexpr(old_itstate), mkU8(8)));
+   /* If the ITxxx optimisation analysis above could not prove that
+      this instruction is guaranteed unconditional, we insert a
+      lengthy IR preamble to compute the guarding condition at
+      runtime.  If it can prove it (which obviously we hope is the
+      normal case) then we insert a minimal preamble, which is
+      equivalent to setting guest_ITSTATE to zero and then folding
+      that through the full preamble (which completely disappears). */
+
+   IRTemp condT              = IRTemp_INVALID;
+   IRTemp old_itstate        = IRTemp_INVALID;
+   IRTemp new_itstate        = IRTemp_INVALID;
+   IRTemp cond_AND_notInIT_T = IRTemp_INVALID;
+
+   if (guaranteedUnconditional) {
+      /* BEGIN "partial eval { ITSTATE = 0; STANDARD_PREAMBLE; }" */
+
+      // ITSTATE = 0 :: I32
+      IRTemp z32 = newTemp(Ity_I32);
+      assign(z32, mkU32(0));
+      put_ITSTATE(z32);
+
+      // old_itstate = 0 :: I32
+      //
+      // old_itstate = get_ITSTATE();
+      old_itstate = z32; /* 0 :: I32 */
+
+      // new_itstate = old_itstate >> 8
+      //             = 0 >> 8
+      //             = 0 :: I32
+      //
+      // new_itstate = newTemp(Ity_I32);
+      // assign(new_itstate,
+      //        binop(Iop_Shr32, mkexpr(old_itstate), mkU8(8)));
+      new_itstate = z32;
+
+      // ITSTATE = 0 :: I32 (again)
+      //
+      // put_ITSTATE(new_itstate);
+
+      // condT1 = calc_cond_dyn( xor(and(old_itstate,0xF0), 0xE0) )
+      //        = calc_cond_dyn( xor(0,0xE0) )
+      //        = calc_cond_dyn ( 0xE0 )
+      //        = 1 :: I32
+      // Not that this matters, since the computed value is not used:
+      // see condT folding below
+      //
+      // IRTemp condT1 = newTemp(Ity_I32);
+      // assign(condT1,
+      //        mk_armg_calculate_condition_dyn(
+      //           binop(Iop_Xor32,
+      //                 binop(Iop_And32, mkexpr(old_itstate), mkU32(0xF0)),
+      //                 mkU32(0xE0))
+      //       )
+      // );
+
+      // condT = 32to8(and32(old_itstate,0xF0)) == 0  ? 1  : condT1
+      //       = 32to8(and32(0,0xF0)) == 0  ? 1  : condT1
+      //       = 32to8(0) == 0  ? 1  : condT1
+      //       = 0 == 0  ? 1  : condT1
+      //       = 1
+      //
+      // condT = newTemp(Ity_I32);
+      // assign(condT, IRExpr_Mux0X(
+      //                  unop(Iop_32to8, binop(Iop_And32,
+      //                                        mkexpr(old_itstate),
+      //                                        mkU32(0xF0))),
+      //                  mkU32(1),
+      //                  mkexpr(condT1)
+      //       ));
+      condT = newTemp(Ity_I32);
+      assign(condT, mkU32(1));
+
+      // notInITt = xor32(and32(old_itstate, 1), 1)
+      //          = xor32(and32(0, 1), 1)
+      //          = xor32(0, 1)
+      //          = 1 :: I32
+      //
+      // IRTemp notInITt = newTemp(Ity_I32);
+      // assign(notInITt,
+      //        binop(Iop_Xor32,
+      //              binop(Iop_And32, mkexpr(old_itstate), mkU32(1)),
+      //              mkU32(1)));
+
+      // cond_AND_notInIT_T = and32(notInITt, condT)
+      //                    = and32(1, 1)
+      //                    = 1
+      //
+      // cond_AND_notInIT_T = newTemp(Ity_I32);
+      // assign(cond_AND_notInIT_T,
+      //        binop(Iop_And32, mkexpr(notInITt), mkexpr(condT)));
+      cond_AND_notInIT_T = condT; /* 1 :: I32 */
+
+      /* END "partial eval { ITSTATE = 0; STANDARD_PREAMBLE; }" */
+   } else {
+      /* BEGIN { STANDARD PREAMBLE; } */
+
+      old_itstate = get_ITSTATE();
+
+      new_itstate = newTemp(Ity_I32);
+      assign(new_itstate,
+             binop(Iop_Shr32, mkexpr(old_itstate), mkU8(8)));
+
+      put_ITSTATE(new_itstate);
+
+      /* Same strategy as for ARM insns: generate a condition
+         temporary at this point (or IRTemp_INVALID, meaning
+         unconditional).  We leave it to lower-level instruction
+         decoders to decide whether they can generate straight-line
+         code, or whether they must generate a side exit before the
+         instruction.  condT :: Ity_I32 and is always either zero or
+         one. */
+      IRTemp condT1 = newTemp(Ity_I32);
+      assign(condT1,
+             mk_armg_calculate_condition_dyn(
+                binop(Iop_Xor32,
+                      binop(Iop_And32, mkexpr(old_itstate), mkU32(0xF0)),
+                      mkU32(0xE0))
+            )
+      );
  
-   put_ITSTATE(new_itstate);
+      /* This is a bit complex, but needed to make Memcheck understand
+         that, if the condition in old_itstate[7:4] denotes AL (that
+         is, if this instruction is to be executed unconditionally),
+         then condT does not depend on the results of calling the
+         helper.
+
+         We test explicitly for old_itstate[7:4] == AL ^ 0xE, and in
+         that case set condT directly to 1.  Else we use the results
+         of the helper.  Since old_itstate is always defined and
+         because Memcheck does lazy V-bit propagation through Mux0X,
+         this will cause condT to always be a defined 1 if the
+         condition is 'AL'.  From an execution semantics point of view
+         this is irrelevant since we're merely duplicating part of the
+         behaviour of the helper.  But it makes it clear to Memcheck,
+         in this case, that condT does not in fact depend on the
+         contents of the condition code thunk.  Without it, we get
+         quite a lot of false errors.
+
+         So, just to clarify: from a straight semantics point of view,
+         we can simply do "assign(condT, mkexpr(condT1))", and the
+         simulator still runs fine.  It's just that we get loads of
+         false errors from Memcheck. */
+      condT = newTemp(Ity_I32);
+      assign(condT, IRExpr_Mux0X(
+                       unop(Iop_32to8, binop(Iop_And32,
+                                             mkexpr(old_itstate),
+                                             mkU32(0xF0))),
+                       mkU32(1),
+                       mkexpr(condT1)
+            ));
  
-   /* Same strategy as for ARM insns: generate a condition temporary
-      at this point  (or IRTemp_INVALID, meaning
-      unconditional).  We leave it to lower-level instruction decoders
-      to decide whether they can generate straight-line code, or
-      whether they must generate a side exit before the instruction.
-      condT :: Ity_I32 and is always either zero or one. */
-   IRTemp condT1 = newTemp(Ity_I32);
-   assign(condT1,
-          mk_armg_calculate_condition_dyn(
+      /* Something we don't have in ARM: generate a 0 or 1 value
+         indicating whether or not we are in an IT block (NB: 0 = in
+         IT block, 1 = not in IT block).  This is used to gate
+         condition code updates in 16-bit Thumb instructions. */
+      IRTemp notInITt = newTemp(Ity_I32);
+      assign(notInITt,
               binop(Iop_Xor32,
-                   binop(Iop_And32, mkexpr(old_itstate), mkU32(0xF0)),
-                   mkU32(0xE0))
-          )
-   );
+                   binop(Iop_And32, mkexpr(old_itstate), mkU32(1)),
+                   mkU32(1)));
  
-   /* This is a bit complex, but needed to make Memcheck understand
-      that, if the condition in old_itstate[7:4] denotes AL (that is,
-      if this instruction is to be executed unconditionally), then
-      condT does not depend on the results of calling the helper.
-
-      We test explicitly for old_itstate[7:4] == AL ^ 0xE, and in that
-      case set condT directly to 1.  Else we use the results of the
-      helper.  Since old_itstate is always defined and because
-      Memcheck does lazy V-bit propagation through Mux0X, this will
-      cause condT to always be a defined 1 if the condition is 'AL'.
-      From an execution semantics point of view this is irrelevant
-      since we're merely duplicating part of the behaviour of the
-      helper.  But it makes it clear to Memcheck, in this case, that
-      condT does not in fact depend on the contents of the condition
-      code thunk.  Without it, we get quite a lot of false errors.
-
-      So, just to clarify: from a straight semantics point of view, we
-      can simply do "assign(condT, mkexpr(condT1))", and the simulator
-      still runs fine.  It's just that we get loads of false errors
-      from Memcheck. */
-   IRTemp condT = newTemp(Ity_I32);
-   assign(condT, IRExpr_Mux0X(
-                    unop(Iop_32to8, binop(Iop_And32,
-                                          mkexpr(old_itstate),
-                                          mkU32(0xF0))),
-                    mkU32(1),
-                    mkexpr(condT1)
-         ));
+      /* Compute 'condT && notInITt' -- that is, the instruction is
+         going to execute, and we're not in an IT block.  This is the
+         gating condition for updating condition codes in 16-bit Thumb
+         instructions, except for CMP, CMN and TST. */
+      cond_AND_notInIT_T = newTemp(Ity_I32);
+      assign(cond_AND_notInIT_T,
+             binop(Iop_And32, mkexpr(notInITt), mkexpr(condT)));
+      /* END { STANDARD PREAMBLE; } */
+   }
  
-   /* Something we don't have in ARM: generate a 0 or 1 value
-      indicating whether or not we are in an IT block (NB: 0 = in IT
-      block, 1 = not in IT block).  This is used to gate condition
-      code updates in 16-bit Thumb instructions. */
-   IRTemp notInITt = newTemp(Ity_I32);
-   assign(notInITt,
-          binop(Iop_Xor32,
-                binop(Iop_And32, mkexpr(old_itstate), mkU32(1)),
-                mkU32(1)));
-
-   /* Compute 'condT && notInITt' -- that is, the instruction is going
-      to execute, and we're not in an IT block.  This is the gating
-      condition for updating condition codes in 16-bit Thumb
-      instructions, except for CMP, CMN and TST. */
-   IRTemp cond_AND_notInIT_T = newTemp(Ity_I32);
-   assign(cond_AND_notInIT_T,
-          binop(Iop_And32, mkexpr(notInITt), mkexpr(condT)));
  
     /* At this point:
        * ITSTATE has been updated
@@ -12260,6 +14777,21 @@ DisResult disInstr_THUMB_WRK (
        goto decode_success;
     }
  
+   case 0x2E8:   // REV
+   case 0x2E9: { // REV16
+      /* ---------------- REV   Rd, Rm ---------------- */
+      /* ---------------- REV16 Rd, Rm ---------------- */
+      UInt rM = INSN0(5,3);
+      UInt rD = INSN0(2,0);
+      Bool isREV = INSN0(15,6) == 0x2E8;
+      IRTemp arg = newTemp(Ity_I32);
+      assign(arg, getIRegT(rM));
+      IRTemp res = isREV ? gen_REV(arg) : gen_REV16(arg);
+      putIRegT(rD, mkexpr(res), condT);
+      DIP("rev%s r%u, r%u\n", isREV ? "" : "16", rD, rM);
+      goto decode_success;
+   }
+
     default:
        break; /* examine the next shortest prefix */
  
@@ -12294,7 +14826,7 @@ DisResult disInstr_THUMB_WRK (
           suitably encoded address therefore (w CPSR.T at the bottom).
           Have to special-case r15, as usual. */
        UInt rM = (INSN0(6,6) << 3) | INSN0(5,3);
-      if (BITS3(0,0,0) == INSN0(2,0) &&/*atc*/rM != 15) {
+      if (BITS3(0,0,0) == INSN0(2,0)) {
           IRTemp dst = newTemp(Ity_I32);
           gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
           mk_skip_over_T16_if_cond_is_false(condT);
@@ -12303,7 +14835,6 @@ DisResult disInstr_THUMB_WRK (
           if (rM <= 14) {
              assign( dst, getIRegT(rM) );
           } else {
-            break; // ATC
              vassert(rM == 15);
              assign( dst, mkU32(guest_R15_curr_instr_notENC + 4) );
           }
@@ -12332,7 +14863,7 @@ DisResult disInstr_THUMB_WRK (
              putIRegT( 14, mkU32( (guest_R15_curr_instr_notENC + 2) | 1 ),
                            IRTemp_INVALID );
              irsb->next     = mkexpr(dst);
-            irsb->jumpkind = Ijk_Boring;
+            irsb->jumpkind = Ijk_Call;
              dres.whatNext  = Dis_StopHere;
              DIP("blx r%u (possibly switch to ARM mode)\n", rM);
              goto decode_success;
@@ -13493,8 +16024,9 @@ DisResult disInstr_THUMB_WRK (
        UInt rN    = INSN0(3,0);
        UInt rD    = INSN1(11,8);
        Bool valid = !isBadRegT(rN) && !isBadRegT(rD);
-      /* but allow "sub.w sp, sp, #constT" */
-      if (!valid && !isRSB && rN == 13 && rD == 13)
+      /* but allow "sub{s}.w reg, sp, #constT";
+         this is (T2) of "SUB (SP minus immediate)" */
+      if (!valid && !isRSB && rN == 13 && rD != 15)
           valid = True;
        if (valid) {
           IRTemp argL  = newTemp(Ity_I32);
@@ -13580,21 +16112,24 @@ DisResult disInstr_THUMB_WRK (
         && (   INSN0(9,5) == BITS5(0,0,0,1,0)  // ORR
             || INSN0(9,5) == BITS5(0,0,0,0,0)  // AND
             || INSN0(9,5) == BITS5(0,0,0,0,1)  // BIC
-           || INSN0(9,5) == BITS5(0,0,1,0,0)) // EOR
+           || INSN0(9,5) == BITS5(0,0,1,0,0)  // EOR
+           || INSN0(9,5) == BITS5(0,0,0,1,1)) // ORN
         && INSN1(15,15) == 0) {
        UInt bS = INSN0(4,4);
        UInt rN = INSN0(3,0);
        UInt rD = INSN1(11,8);
        if (!isBadRegT(rN) && !isBadRegT(rD)) {
-         Bool   isBIC = False;
-         IROp   op    = Iop_INVALID;
-         HChar* nm    = "???";
+         Bool   notArgR = False;
+         IROp   op      = Iop_INVALID;
+         HChar* nm      = "???";
           switch (INSN0(9,5)) {
              case BITS5(0,0,0,1,0): op = Iop_Or32;  nm = "orr"; break;
              case BITS5(0,0,0,0,0): op = Iop_And32; nm = "and"; break;
              case BITS5(0,0,0,0,1): op = Iop_And32; nm = "bic";
-                                   isBIC = True; break;
-            case BITS5(0,0,1,0,0): op = Iop_Xor32;  nm = "eor"; break;
+                                   notArgR = True; break;
+            case BITS5(0,0,1,0,0): op = Iop_Xor32; nm = "eor"; break;
+            case BITS5(0,0,0,1,1): op = Iop_Or32;  nm = "orn";
+                                   notArgR = True; break;
              default: vassert(0);
           }
           IRTemp argL  = newTemp(Ity_I32);
@@ -13603,7 +16138,7 @@ DisResult disInstr_THUMB_WRK (
           Bool   updC  = False;
           UInt   imm32 = thumbExpandImm_from_I0_I1(&updC, insn0, insn1);
           assign(argL, getIRegT(rN));
-         assign(argR, mkU32(isBIC ? ~imm32 : imm32));
+         assign(argR, mkU32(notArgR ? ~imm32 : imm32));
           assign(res,  binop(op, mkexpr(argL), mkexpr(argR)));
           putIRegT(rD, mkexpr(res), condT);
           if (bS) {
@@ -13638,14 +16173,16 @@ DisResult disInstr_THUMB_WRK (
        UInt how  = INSN1(5,4);
  
        Bool valid = !isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM);
-      /* but allow "add.w reg, sp, reg   w/ no shift */
+      /* but allow "add.w reg, sp, reg" w/ no shift;
+         this is (T3) of "ADD (SP plus register)" */
        if (!valid && INSN0(8,5) == BITS4(1,0,0,0) // add
-          && rN == 13 && imm5 == 0 && how == 0) {
+          && rD != 15 && rN == 13 && imm5 == 0 && how == 0) {
           valid = True;
        }
-      /* also allow "sub.w sp, sp, reg   w/ no shift */
-      if (!valid && INSN0(8,5) == BITS4(1,1,0,1) // add
-          && rD == 13 && rN == 13 && imm5 == 0 && how == 0) {
+      /* also allow "sub.w reg, sp, reg" w/ no shift;
+         this is (T1) of "SUB (SP minus register)" */
+      if (!valid && INSN0(8,5) == BITS4(1,1,0,1) // sub
+          && rD != 15 && rN == 13 && imm5 == 0 && how == 0) {
           valid = True;
        }
        if (valid) {
@@ -13768,25 +16305,29 @@ DisResult disInstr_THUMB_WRK (
     /* ---------- (T3) ORR{S}.W Rd, Rn, Rm, {shift} ---------- */
     /* ---------- (T3) EOR{S}.W Rd, Rn, Rm, {shift} ---------- */
     /* ---------- (T3) BIC{S}.W Rd, Rn, Rm, {shift} ---------- */
+   /* ---------- (T1) ORN{S}.W Rd, Rn, Rm, {shift} ---------- */
     if (INSN0(15,9) == BITS7(1,1,1,0,1,0,1)
         && (   INSN0(8,5) == BITS4(0,0,0,0)  // and subopc
             || INSN0(8,5) == BITS4(0,0,1,0)  // orr subopc
             || INSN0(8,5) == BITS4(0,1,0,0)  // eor subopc
-           || INSN0(8,5) == BITS4(0,0,0,1)) // bic subopc
+           || INSN0(8,5) == BITS4(0,0,0,1)  // bic subopc
+           || INSN0(8,5) == BITS4(0,0,1,1)) // orn subopc
         && INSN1(15,15) == 0) {
        UInt rN = INSN0(3,0);
        UInt rD = INSN1(11,8);
        UInt rM = INSN1(3,0);
        if (!isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM)) {
-         Bool isBIC = False;
-         IROp op    = Iop_INVALID;
+         Bool notArgR = False;
+         IROp op      = Iop_INVALID;
           HChar* nm  = "???";
           switch (INSN0(8,5)) {
              case BITS4(0,0,0,0): op = Iop_And32; nm = "and"; break;
              case BITS4(0,0,1,0): op = Iop_Or32;  nm = "orr"; break;
              case BITS4(0,1,0,0): op = Iop_Xor32; nm = "eor"; break;
              case BITS4(0,0,0,1): op = Iop_And32; nm = "bic";
-                                 isBIC = True; break;
+                                 notArgR = True; break;
+            case BITS4(0,0,1,1): op = Iop_Or32; nm = "orn";
+                                 notArgR = True; break;
              default: vassert(0);
           }
           UInt bS   = INSN0(4,4);
@@ -13807,8 +16348,8 @@ DisResult disInstr_THUMB_WRK (
           );
  
           IRTemp res = newTemp(Ity_I32);
-         if (isBIC) {
-            vassert(op == Iop_And32);
+         if (notArgR) {
+            vassert(op == Iop_And32 || op == Iop_Or32);
              assign(res, binop(op, mkexpr(rNt),
                                    unop(Iop_Not32, mkexpr(argR))));
           } else {
@@ -14749,10 +17290,16 @@ DisResult disInstr_THUMB_WRK (
     /* ------------------ UXTH ------------------ */
     /* ------------------ SXTB ------------------ */
     /* ------------------ SXTH ------------------ */
+   /* ----------------- UXTB16 ----------------- */
+   /* ----------------- SXTB16 ----------------- */
+   /* FIXME: this is an exact duplicate of the ARM version.  They
+      should be commoned up. */
     if ((INSN0(15,0) == 0xFA5F     // UXTB
          || INSN0(15,0) == 0xFA1F  // UXTH
          || INSN0(15,0) == 0xFA4F  // SXTB
-        || INSN0(15,0) == 0xFA0F) // SXTH
+        || INSN0(15,0) == 0xFA0F  // SXTH
+        || INSN0(15,0) == 0xFA3F  // UXTB16
+        || INSN0(15,0) == 0xFA2F) // SXTB16
         && INSN1(15,12) == BITS4(1,1,1,1)
         && INSN1(7,6) == BITS2(1,0)) {
        UInt rD = INSN1(11,8);
@@ -14786,6 +17333,31 @@ DisResult disInstr_THUMB_WRK (
                 assign(dstT, unop(Iop_16Sto32,
                                   unop(Iop_32to16, mkexpr(rotT))));
                 break;
+            case 0xFA3F: // UXTB16
+               nm = "uxtb16";
+               assign(dstT, binop(Iop_And32, mkexpr(rotT),
+                                             mkU32(0x00FF00FF)));
+               break;
+            case 0xFA2F: { // SXTB16
+               nm = "sxtb16";
+               IRTemp lo32 = newTemp(Ity_I32);
+               IRTemp hi32 = newTemp(Ity_I32);
+               assign(lo32, binop(Iop_And32, mkexpr(rotT), mkU32(0xFF)));
+               assign(hi32, binop(Iop_Shr32, mkexpr(rotT), mkU8(16)));
+               assign(
+                  dstT,
+                  binop(Iop_Or32,
+                        binop(Iop_And32,
+                              unop(Iop_8Sto32,
+                                   unop(Iop_32to8, mkexpr(lo32))),
+                              mkU32(0xFFFF)),
+                        binop(Iop_Shl32,
+                              unop(Iop_8Sto32,
+                                   unop(Iop_32to8, mkexpr(hi32))),
+                              mkU8(16))
+               ));
+               break;
+            }
              default:
                 vassert(0);
           }
@@ -14966,27 +17538,6 @@ DisResult disInstr_THUMB_WRK (
        }
     }
  
-   /* ------------------- (T1) SMULBB ------------------- */
-   if (INSN0(15,4) == 0xFB1
-       && INSN1(15,12) == BITS4(1,1,1,1)
-       && INSN1(7,6) == BITS2(0,0)
-       && INSN1(5,4) == BITS2(0,0)) { // other values -> BT/TB/TT
-      UInt rN = INSN0(3,0);
-      UInt rD = INSN1(11,8);
-      UInt rM = INSN1(3,0);
-      if (!isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM)) {
-         putIRegT(rD,
-                  binop(Iop_Mul32,
-                        unop(Iop_16Sto32,
-                             unop(Iop_32to16, getIRegT(rN))),
-                        unop(Iop_16Sto32,
-                             unop(Iop_32to16, getIRegT(rM)))),
-                  condT);
-         DIP("smulbb r%u, r%u, r%u\n", rD, rN, rM);
-         goto decode_success;
-      }
-   }
-
     /* ------------------- (T1) SXTAH ------------------- */
     /* ------------------- (T1) UXTAH ------------------- */
     if ((INSN0(15,4) == 0xFA1      // UXTAH
@@ -15050,7 +17601,7 @@ DisResult disInstr_THUMB_WRK (
         && INSN1(15,12) == BITS4(1,1,1,1)
         && INSN1(7,4) == BITS4(1,0,0,0)) {
        UInt rM1 = INSN0(3,0);
-      UInt rD = INSN1(11,8);
+      UInt rD  = INSN1(11,8);
        UInt rM2 = INSN1(3,0);
        if (!isBadRegT(rD) && !isBadRegT(rM1) && rM1 == rM2) {
           IRTemp arg = newTemp(Ity_I32);
@@ -15069,25 +17620,55 @@ DisResult disInstr_THUMB_WRK (
        }
     }
  
+   /* ------------------- (T1) RBIT ------------------- */
+   if (INSN0(15,4) == 0xFA9
+       && INSN1(15,12) == BITS4(1,1,1,1)
+       && INSN1(7,4) == BITS4(1,0,1,0)) {
+      UInt rM1 = INSN0(3,0);
+      UInt rD  = INSN1(11,8);
+      UInt rM2 = INSN1(3,0);
+      if (!isBadRegT(rD) && !isBadRegT(rM1) && rM1 == rM2) {
+         IRTemp arg = newTemp(Ity_I32);
+         assign(arg, getIRegT(rM1));
+         IRTemp res = gen_BITREV(arg);
+         putIRegT(rD, mkexpr(res), condT);
+         DIP("rbit r%u, r%u\n", rD, rM1);
+         goto decode_success;
+      }
+   }
+
+   /* ------------------- (T2) REV   ------------------- */
+   /* ------------------- (T2) REV16 ------------------- */
+   if (INSN0(15,4) == 0xFA9
+       && INSN1(15,12) == BITS4(1,1,1,1)
+       && (   INSN1(7,4) == BITS4(1,0,0,0)     // REV
+           || INSN1(7,4) == BITS4(1,0,0,1))) { // REV16
+      UInt rM1   = INSN0(3,0);
+      UInt rD    = INSN1(11,8);
+      UInt rM2   = INSN1(3,0);
+      Bool isREV = INSN1(7,4) == BITS4(1,0,0,0);
+      if (!isBadRegT(rD) && !isBadRegT(rM1) && rM1 == rM2) {
+         IRTemp arg = newTemp(Ity_I32);
+         assign(arg, getIRegT(rM1));
+         IRTemp res = isREV ? gen_REV(arg) : gen_REV16(arg);
+         putIRegT(rD, mkexpr(res), condT);
+         DIP("rev%s r%u, r%u\n", isREV ? "" : "16", rD, rM1);
+         goto decode_success;
+      }
+   }
+
     /* -------------- (T1) MSR apsr, reg -------------- */
     if (INSN0(15,4) == 0xF38 
         && INSN1(15,12) == BITS4(1,0,0,0) && INSN1(9,0) == 0x000) {
        UInt rN          = INSN0(3,0);
        UInt write_ge    = INSN1(10,10);
        UInt write_nzcvq = INSN1(11,11);
-      if (!isBadRegT(rN) && write_nzcvq && !write_ge) {
+      if (!isBadRegT(rN) && (write_nzcvq || write_ge)) {
           IRTemp rNt = newTemp(Ity_I32);
           assign(rNt, getIRegT(rN));
-         // Do NZCV
-         IRTemp immT = newTemp(Ity_I32);
-         assign(immT, binop(Iop_And32, mkexpr(rNt), mkU32(0xF0000000)) );
-         setFlags_D1(ARMG_CC_OP_COPY, immT, condT);
-         // Do Q
-         IRTemp qnewT = newTemp(Ity_I32);
-         assign(qnewT, binop(Iop_And32, mkexpr(rNt), mkU32(ARMG_CC_MASK_Q)));
-         put_QFLAG32(qnewT, condT);
-         // 
-         DIP("msr cpsr_f, r%u\n", rN);
+         desynthesise_APSR( write_nzcvq, write_ge, rNt, condT );
+         DIP("msr cpsr_%s%s, r%u\n",
+             write_nzcvq ? "f" : "", write_ge ? "g" : "", rN);
           goto decode_success;
        }
     }
@@ -15097,28 +17678,129 @@ DisResult disInstr_THUMB_WRK (
         && INSN1(15,12) == BITS4(1,0,0,0) && INSN1(7,0) == 0x00) {
        UInt rD = INSN1(11,8);
        if (!isBadRegT(rD)) {
-         IRTemp res1 = newTemp(Ity_I32);
-         // Get NZCV
-         assign( res1, mk_armg_calculate_flags_nzcv() );
-         /// OR in the Q value
-         IRTemp res2 = newTemp(Ity_I32);
-         assign(
-            res2,
-            binop(Iop_Or32,
-                  mkexpr(res1),
-                  binop(Iop_Shl32,
-                        unop(Iop_1Uto32,
-                             binop(Iop_CmpNE32,
-                                   mkexpr(get_QFLAG32()),
-                                   mkU32(0))),
-                        mkU8(ARMG_CC_SHIFT_Q)))
-         );
-         putIRegT( rD, mkexpr(res2), condT );
+         IRTemp apsr = synthesise_APSR();
+         putIRegT( rD, mkexpr(apsr), condT );
           DIP("mrs r%u, cpsr\n", rD);
           goto decode_success;
        }
     }
  
+   /* ----------------- (T1) LDREX ----------------- */
+   if (INSN0(15,4) == 0xE85 && INSN1(11,8) == BITS4(1,1,1,1)) {
+      UInt rN   = INSN0(3,0);
+      UInt rT   = INSN1(15,12);
+      UInt imm8 = INSN1(7,0);
+      if (!isBadRegT(rT) && rN != 15) {
+         IRTemp res;
+         // go uncond
+         mk_skip_over_T32_if_cond_is_false( condT );
+         // now uncond
+         res = newTemp(Ity_I32);
+         stmt( IRStmt_LLSC(Iend_LE,
+                           res,
+                           binop(Iop_Add32, getIRegT(rN), mkU32(imm8 * 4)),
+                           NULL/*this is a load*/ ));
+         putIRegT(rT, mkexpr(res), IRTemp_INVALID);
+         DIP("ldrex r%u, [r%u, #+%u]\n", rT, rN, imm8 * 4);
+         goto decode_success;
+      }
+   }
+
+   /* ----------------- (T1) STREX ----------------- */
+   if (INSN0(15,4) == 0xE84) {
+      UInt rN   = INSN0(3,0);
+      UInt rT   = INSN1(15,12);
+      UInt rD   = INSN1(11,8);
+      UInt imm8 = INSN1(7,0);
+      if (!isBadRegT(rD) && !isBadRegT(rT) && rN != 15 
+          && rD != rN && rD != rT) {
+         IRTemp resSC1, resSC32;
+
+         // go uncond
+         mk_skip_over_T32_if_cond_is_false( condT );
+         // now uncond
+
+         /* Ok, now we're unconditional.  Do the store. */
+         resSC1 = newTemp(Ity_I1);
+         stmt( IRStmt_LLSC(Iend_LE,
+                           resSC1,
+                           binop(Iop_Add32, getIRegT(rN), mkU32(imm8 * 4)),
+                           getIRegT(rT)) );
+
+         /* Set rD to 1 on failure, 0 on success.  Currently we have
+            resSC1 == 0 on failure, 1 on success. */
+         resSC32 = newTemp(Ity_I32);
+         assign(resSC32,
+                unop(Iop_1Uto32, unop(Iop_Not1, mkexpr(resSC1))));
+
+         putIRegT(rD, mkexpr(resSC32), IRTemp_INVALID);
+         DIP("strex r%u, r%u, [r%u, #+%u]\n", rD, rT, rN, imm8 * 4);
+         goto decode_success;
+      }
+   }
+
+   /* -------------- v7 barrier insns -------------- */
+   if (INSN0(15,0) == 0xF3BF && (INSN1(15,0) & 0xFF00) == 0x8F00) {
+      /* XXX this isn't really right, is it?  The generated IR does
+         them unconditionally.  I guess it doesn't matter since it
+         doesn't do any harm to do them even when the guarding
+         condition is false -- it's just a performance loss. */
+      switch (INSN1(7,0)) {
+         case 0x4F: /* DSB sy */
+         case 0x4E: /* DSB st */
+         case 0x4B: /* DSB ish */
+         case 0x4A: /* DSB ishst */
+         case 0x47: /* DSB nsh */
+         case 0x46: /* DSB nshst */
+         case 0x43: /* DSB osh */
+         case 0x42: /* DSB oshst */
+            stmt( IRStmt_MBE(Imbe_Fence) );
+            DIP("DSB\n");
+            goto decode_success;
+         case 0x5F: /* DMB sy */
+         case 0x5E: /* DMB st */
+         case 0x5B: /* DMB ish */
+         case 0x5A: /* DMB ishst */
+         case 0x57: /* DMB nsh */
+         case 0x56: /* DMB nshst */
+         case 0x53: /* DMB osh */
+         case 0x52: /* DMB oshst */
+            stmt( IRStmt_MBE(Imbe_Fence) );
+            DIP("DMB\n");
+            goto decode_success;
+         case 0x6F: /* ISB */
+            stmt( IRStmt_MBE(Imbe_Fence) );
+            DIP("ISB\n");
+            goto decode_success;
+         default:
+            break;
+      }
+   }
+
+   /* -------------- read CP15 TPIDRURO register ------------- */
+   /* mrc     p15, 0,  r0, c13, c0, 3  up to
+      mrc     p15, 0, r14, c13, c0, 3
+   */
+   /* I don't know whether this is really v7-only.  But anyway, we
+      have to support it since arm-linux uses TPIDRURO as a thread
+      state register.  No skip-over is generated for this insn, so
+      the register write itself must be guarded by condT. */
+   if ((INSN0(15,0) == 0xEE1D) && (INSN1(11,0) == 0x0F70)) {
+      UInt rD = INSN1(15,12);
+      if (!isBadRegT(rD)) {
+         putIRegT(rD, IRExpr_Get(OFFB_TPIDRURO, Ity_I32), condT);
+         DIP("mrc p15,0, r%u, c13, c0, 3\n", rD);
+         goto decode_success;
+      }
+      /* fall through */
+   }
+
+   /* ------------------- NOP ------------------ */
+   if (INSN0(15,0) == 0xF3AF && INSN1(15,0) == 0x8000) {
+      DIP("nop\n");
+      goto decode_success;
+   }
+
     /* ----------------------------------------------------------- */
     /* -- VFP (CP 10, CP 11) instructions (in Thumb mode)       -- */
     /* ----------------------------------------------------------- */
@@ -15137,11 +17819,25 @@ DisResult disInstr_THUMB_WRK (
     /* -- NEON instructions (in Thumb mode)                     -- */
     /* ----------------------------------------------------------- */
  
+   if (archinfo->hwcaps & VEX_HWCAPS_ARM_NEON) {
+      UInt insn32 = (INSN0(15,0) << 16) | INSN1(15,0);
+      Bool ok_neon = decode_NEON_instruction(
+                        &dres, insn32, condT, True/*isT*/
+                     );
+      if (ok_neon)
+         goto decode_success;
+   }
+
+   /* ----------------------------------------------------------- */
+   /* -- v6 media instructions (in Thumb mode)                 -- */
+   /* ----------------------------------------------------------- */
+
     { UInt insn32 = (INSN0(15,0) << 16) | INSN1(15,0);
-     Bool ok_neon = decode_NEON_instruction(
-                       &dres, insn32, condT, True/*isT*/
-                    );
-     if (ok_neon)
+     Bool ok_v6m = decode_V6MEDIA_instruction(
+                      &dres, insn32, condT, ARMCondAL/*bogus*/,
+                      True/*isT*/
+                   );
+     if (ok_v6m)
          goto decode_success;
     }
  
@@ -15196,7 +17892,7 @@ DisResult disInstr_THUMB_WRK (
           assert here. */
        vassert(dres.whatNext == Dis_Continue);
        vassert(irsb->next == NULL);
-      vassert(irsb->jumpkind = Ijk_Boring);
+      vassert(irsb->jumpkind == Ijk_Boring);
        /* If r15 is unconditionally written, terminate the block by
           jumping to it.  If it's conditionally written, still
           terminate the block (a shame, but we can't do side exits to