KVM: x86 emulator: address size and operand size overrides are sticky
[safe/jmp/linux-2.6] / drivers / kvm / x86_emulate.c
index c191093..3be506a 100644 (file)
 #include <stdio.h>
 #include <stdint.h>
 #include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf( _f , ## _a )
+#define DPRINTF(_f, _a ...) printf(_f , ## _a)
 #else
 #include "kvm.h"
+#include "x86.h"
 #define DPRINTF(x...) do {} while (0)
 #endif
 #include "x86_emulate.h"
@@ -62,8 +63,9 @@
 /* Destination is only written; never read. */
 #define Mov         (1<<7)
 #define BitOp       (1<<8)
+#define MemAbs      (1<<9)      /* Memory operand is absolute displacement */
 
-static u8 opcode_table[256] = {
+static u16 opcode_table[256] = {
        /* 0x00 - 0x07 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -96,14 +98,14 @@ static u8 opcode_table[256] = {
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
        0, 0, 0, 0,
-       /* 0x40 - 0x4F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x40 - 0x47 */
+       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+       /* 0x48 - 0x4F */
+       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
        /* 0x50 - 0x57 */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       SrcReg, SrcReg, SrcReg, SrcReg, SrcReg, SrcReg, SrcReg, SrcReg,
        /* 0x58 - 0x5F */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
        /* 0x60 - 0x67 */
        0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
        0, 0, 0, 0,
@@ -129,8 +131,8 @@ static u8 opcode_table[256] = {
        /* 0x90 - 0x9F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
        /* 0xA0 - 0xA7 */
-       ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
-       ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
+       ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
+       ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
        ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
        ByteOp | ImplicitOps, ImplicitOps,
        /* 0xA8 - 0xAF */
@@ -157,10 +159,10 @@ static u8 opcode_table[256] = {
        ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
        /* 0xF0 - 0xF7 */
        0, 0, 0, 0,
-       ImplicitOps, 0,
+       ImplicitOps, ImplicitOps,
        ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
        /* 0xF8 - 0xFF */
-       0, 0, 0, 0,
+       ImplicitOps, 0, ImplicitOps, ImplicitOps,
        0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
 };
 
@@ -285,21 +287,21 @@ static u16 twobyte_table[256] = {
                switch ((_dst).bytes) {                                     \
                case 2:                                                     \
                        __asm__ __volatile__ (                              \
-                               _PRE_EFLAGS("0","4","2")                    \
+                               _PRE_EFLAGS("0", "4", "2")                  \
                                _op"w %"_wx"3,%1; "                         \
-                               _POST_EFLAGS("0","4","2")                   \
+                               _POST_EFLAGS("0", "4", "2")                 \
                                : "=m" (_eflags), "=m" ((_dst).val),        \
                                  "=&r" (_tmp)                              \
-                               : _wy ((_src).val), "i" (EFLAGS_MASK) );    \
+                               : _wy ((_src).val), "i" (EFLAGS_MASK));     \
                        break;                                              \
                case 4:                                                     \
                        __asm__ __volatile__ (                              \
-                               _PRE_EFLAGS("0","4","2")                    \
+                               _PRE_EFLAGS("0", "4", "2")                  \
                                _op"l %"_lx"3,%1; "                         \
-                               _POST_EFLAGS("0","4","2")                   \
+                               _POST_EFLAGS("0", "4", "2")                 \
                                : "=m" (_eflags), "=m" ((_dst).val),        \
                                  "=&r" (_tmp)                              \
-                               : _ly ((_src).val), "i" (EFLAGS_MASK) );    \
+                               : _ly ((_src).val), "i" (EFLAGS_MASK));     \
                        break;                                              \
                case 8:                                                     \
                        __emulate_2op_8byte(_op, _src, _dst,                \
@@ -311,16 +313,15 @@ static u16 twobyte_table[256] = {
 #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
        do {                                                                 \
                unsigned long _tmp;                                          \
-               switch ( (_dst).bytes )                                      \
-               {                                                            \
+               switch ((_dst).bytes) {                                      \
                case 1:                                                      \
                        __asm__ __volatile__ (                               \
-                               _PRE_EFLAGS("0","4","2")                     \
+                               _PRE_EFLAGS("0", "4", "2")                   \
                                _op"b %"_bx"3,%1; "                          \
-                               _POST_EFLAGS("0","4","2")                    \
+                               _POST_EFLAGS("0", "4", "2")                  \
                                : "=m" (_eflags), "=m" ((_dst).val),         \
                                  "=&r" (_tmp)                               \
-                               : _by ((_src).val), "i" (EFLAGS_MASK) );     \
+                               : _by ((_src).val), "i" (EFLAGS_MASK));      \
                        break;                                               \
                default:                                                     \
                        __emulate_2op_nobyte(_op, _src, _dst, _eflags,       \
@@ -349,34 +350,33 @@ static u16 twobyte_table[256] = {
        do {                                                            \
                unsigned long _tmp;                                     \
                                                                        \
-               switch ( (_dst).bytes )                                 \
-               {                                                       \
+               switch ((_dst).bytes) {                                 \
                case 1:                                                 \
                        __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0","3","2")                \
+                               _PRE_EFLAGS("0", "3", "2")              \
                                _op"b %1; "                             \
-                               _POST_EFLAGS("0","3","2")               \
+                               _POST_EFLAGS("0", "3", "2")             \
                                : "=m" (_eflags), "=m" ((_dst).val),    \
                                  "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK) );                  \
+                               : "i" (EFLAGS_MASK));                   \
                        break;                                          \
                case 2:                                                 \
                        __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0","3","2")                \
+                               _PRE_EFLAGS("0", "3", "2")              \
                                _op"w %1; "                             \
-                               _POST_EFLAGS("0","3","2")               \
+                               _POST_EFLAGS("0", "3", "2")             \
                                : "=m" (_eflags), "=m" ((_dst).val),    \
                                  "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK) );                  \
+                               : "i" (EFLAGS_MASK));                   \
                        break;                                          \
                case 4:                                                 \
                        __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0","3","2")                \
+                               _PRE_EFLAGS("0", "3", "2")              \
                                _op"l %1; "                             \
-                               _POST_EFLAGS("0","3","2")               \
+                               _POST_EFLAGS("0", "3", "2")             \
                                : "=m" (_eflags), "=m" ((_dst).val),    \
                                  "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK) );                  \
+                               : "i" (EFLAGS_MASK));                   \
                        break;                                          \
                case 8:                                                 \
                        __emulate_1op_8byte(_op, _dst, _eflags);        \
@@ -389,21 +389,21 @@ static u16 twobyte_table[256] = {
 #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
        do {                                                              \
                __asm__ __volatile__ (                                    \
-                       _PRE_EFLAGS("0","4","2")                          \
+                       _PRE_EFLAGS("0", "4", "2")                        \
                        _op"q %"_qx"3,%1; "                               \
-                       _POST_EFLAGS("0","4","2")                         \
+                       _POST_EFLAGS("0", "4", "2")                       \
                        : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-                       : _qy ((_src).val), "i" (EFLAGS_MASK) );          \
+                       : _qy ((_src).val), "i" (EFLAGS_MASK));         \
        } while (0)
 
 #define __emulate_1op_8byte(_op, _dst, _eflags)                           \
        do {                                                              \
                __asm__ __volatile__ (                                    \
-                       _PRE_EFLAGS("0","3","2")                          \
+                       _PRE_EFLAGS("0", "3", "2")                        \
                        _op"q %1; "                                       \
-                       _POST_EFLAGS("0","3","2")                         \
+                       _POST_EFLAGS("0", "3", "2")                       \
                        : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-                       : "i" (EFLAGS_MASK) );                            \
+                       : "i" (EFLAGS_MASK));                             \
        } while (0)
 
 #elif defined(__i386__)
@@ -414,9 +414,8 @@ static u16 twobyte_table[256] = {
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch(_type, _size, _eip)                                  \
 ({     unsigned long _x;                                               \
-       rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x,  \
-                                                  (_size), ctxt->vcpu); \
-       if ( rc != 0 )                                                  \
+       rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));            \
+       if (rc != 0)                                                    \
                goto done;                                              \
        (_eip) += (_size);                                              \
        (_type)_x;                                                      \
@@ -446,6 +445,41 @@ static u16 twobyte_table[256] = {
                register_address_increment(c->eip, rel);                \
        } while (0)
 
+/*
+ * Fetch a single instruction byte located at linear address 'linear' and
+ * store it in *dest.
+ *
+ * Bytes are served from the small buffer in ctxt->decode.fetch.  When
+ * 'linear' falls outside the buffered range [start, end), the buffer is
+ * refilled through ops->read_std() beginning at 'linear'.  A refill reads
+ * at most 15 bytes and never crosses the end of the page holding 'linear'.
+ *
+ * Returns 0 on success, or the non-zero value returned by ops->read_std().
+ */
+static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
+                             struct x86_emulate_ops *ops,
+                             unsigned long linear, u8 *dest)
+{
+       struct fetch_cache *cache = &ctxt->decode.fetch;
+       int nbytes;
+       int err;
+
+       /* Fast path: the requested byte is already buffered. */
+       if (linear >= cache->start && linear < cache->end)
+               goto hit;
+
+       /* Refill, clamped so that the read stays within a single page. */
+       nbytes = min(15UL, PAGE_SIZE - offset_in_page(linear));
+       err = ops->read_std(linear, cache->data, nbytes, ctxt->vcpu);
+       if (err)
+               return err;
+       cache->start = linear;
+       cache->end = linear + nbytes;
+hit:
+       *dest = cache->data[linear - cache->start];
+       return 0;
+}
+
+/*
+ * Copy 'size' instruction bytes into 'dest', starting at offset 'eip' from
+ * ctxt->cs_base.  The bytes are obtained one at a time through
+ * do_fetch_insn_byte().
+ *
+ * Returns 0 on success, or the first non-zero value reported by
+ * do_fetch_insn_byte(); bytes fetched before the failure stay in 'dest'.
+ */
+static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
+                        struct x86_emulate_ops *ops,
+                        unsigned long eip, void *dest, unsigned size)
+{
+       unsigned long linear = eip + ctxt->cs_base;
+       u8 *out = dest;
+       unsigned i;
+       int err;
+
+       for (i = 0; i < size; i++) {
+               err = do_fetch_insn_byte(ctxt, ops, linear + i, out + i);
+               if (err)
+                       return err;
+       }
+       return 0;
+}
+
 /*
  * Given the 'reg' portion of a ModRM byte, and a register block, return a
  * pointer into the block that addresses the relevant register.
@@ -516,15 +550,215 @@ static int test_cc(unsigned int condition, unsigned int flags)
        return (!!rc ^ (condition & 1));
 }
 
+/*
+ * Fill in 'op' as a register operand for the instruction described by 'c'.
+ *
+ * The register number comes from c->modrm_reg when the opcode has a ModRM
+ * byte; otherwise it is the low three bits of the opcode byte c->b,
+ * extended by bit 0 of the REX prefix.
+ *
+ * A ByteOp instruction yields a 1-byte operand unless 'inhibit_bytereg' is
+ * non-zero; in every other case the operand is c->op_bytes wide.  The
+ * current register contents are loaded into op->val and mirrored in
+ * op->orig_val.
+ */
+static void decode_register_operand(struct operand *op,
+                                   struct decode_cache *c,
+                                   int inhibit_bytereg)
+{
+       unsigned regno;
+       int byte_access = (c->d & ByteOp) && !inhibit_bytereg;
+
+       if (c->d & ModRM)
+               regno = c->modrm_reg;
+       else
+               regno = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+
+       op->type = OP_REG;
+
+       if (byte_access) {
+               /* High-byte register decoding is requested only without REX. */
+               op->ptr = decode_register(regno, c->regs, c->rex_prefix == 0);
+               op->bytes = 1;
+               op->val = *(u8 *)op->ptr;
+       } else {
+               op->ptr = decode_register(regno, c->regs, 0);
+               op->bytes = c->op_bytes;
+               /* Any other width leaves op->val untouched. */
+               if (op->bytes == 2)
+                       op->val = *(u16 *)op->ptr;
+               else if (op->bytes == 4)
+                       op->val = *(u32 *)op->ptr;
+               else if (op->bytes == 8)
+                       op->val = *(u64 *)op->ptr;
+       }
+
+       op->orig_val = op->val;
+}
+
+/*
+ * Decode the ModRM byte of the instruction held in ctxt->decode, together
+ * with any SIB byte and displacement that follow it.
+ *
+ * Bytes are consumed through insn_fetch(), which advances c->eip and jumps
+ * to 'done' with a non-zero 'rc' when a fetch fails.  On return c->modrm,
+ * c->modrm_mod, c->modrm_reg and c->modrm_rm are filled in and
+ * c->use_modrm_ea is set.  For mod == 3 the value of the selected register
+ * is stored in c->modrm_val; otherwise the effective address is accumulated
+ * in c->modrm_ea.  No segment base is added here; the 16-bit path only
+ * selects ss_base as the default base for BP-based forms.
+ *
+ * Returns 0 on success, or the non-zero code of the failed fetch.
+ */
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+                       struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       u8 sib;
+       int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
+       int rc = 0;
+
+       if (c->rex_prefix) {
+               c->modrm_reg = (c->rex_prefix & 4) << 1;        /* REX.R */
+               index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
+               c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
+       }
+
+       c->modrm = insn_fetch(u8, 1, c->eip);
+       c->modrm_mod |= (c->modrm & 0xc0) >> 6;
+       c->modrm_reg |= (c->modrm & 0x38) >> 3;
+       c->modrm_rm |= (c->modrm & 0x07);
+       c->modrm_ea = 0;
+       c->use_modrm_ea = 1;
+
+       /* Register-direct form: no memory operand to compute. */
+       if (c->modrm_mod == 3) {
+               c->modrm_val = *(unsigned long *)
+                       decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+               return rc;
+       }
+
+       if (c->ad_bytes == 2) {
+               unsigned bx = c->regs[VCPU_REGS_RBX];
+               unsigned bp = c->regs[VCPU_REGS_RBP];
+               unsigned si = c->regs[VCPU_REGS_RSI];
+               unsigned di = c->regs[VCPU_REGS_RDI];
+
+               /* 16-bit ModR/M decode. */
+               switch (c->modrm_mod) {
+               case 0:
+                       if (c->modrm_rm == 6)
+                               c->modrm_ea += insn_fetch(u16, 2, c->eip);
+                       break;
+               case 1:
+                       c->modrm_ea += insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->modrm_ea += insn_fetch(u16, 2, c->eip);
+                       break;
+               }
+               switch (c->modrm_rm) {
+               case 0:
+                       c->modrm_ea += bx + si;
+                       break;
+               case 1:
+                       c->modrm_ea += bx + di;
+                       break;
+               case 2:
+                       c->modrm_ea += bp + si;
+                       break;
+               case 3:
+                       c->modrm_ea += bp + di;
+                       break;
+               case 4:
+                       c->modrm_ea += si;
+                       break;
+               case 5:
+                       c->modrm_ea += di;
+                       break;
+               case 6:
+                       if (c->modrm_mod != 0)
+                               c->modrm_ea += bp;
+                       break;
+               case 7:
+                       c->modrm_ea += bx;
+                       break;
+               }
+               /* BP-based forms default to SS unless a prefix chose a base. */
+               if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
+                   (c->modrm_rm == 6 && c->modrm_mod != 0))
+                       if (!c->override_base)
+                               c->override_base = &ctxt->ss_base;
+               c->modrm_ea = (u16)c->modrm_ea;
+       } else {
+               /*
+                * 32/64-bit ModR/M decode.
+                *
+                * REX.B is not decoded for the "SIB follows" (r/m == 4),
+                * "disp32 / RIP-relative" (mod == 0, r/m == 5) and "no base"
+                * (mod == 0, SIB base == 5) encodings, so those tests look at
+                * the low three bits only.
+                */
+               switch (c->modrm_rm & 7) {
+               case 4:
+                       sib = insn_fetch(u8, 1, c->eip);
+                       index_reg |= (sib >> 3) & 7;
+                       base_reg |= sib & 7;
+                       scale = sib >> 6;
+
+                       if ((base_reg & 7) == 5 && c->modrm_mod == 0)
+                               c->modrm_ea += insn_fetch(s32, 4, c->eip);
+                       else
+                               c->modrm_ea += c->regs[base_reg];
+                       /* Index 4 without REX.X means "no index register". */
+                       if (index_reg != 4)
+                               c->modrm_ea += c->regs[index_reg] << scale;
+                       break;
+               case 5:
+                       if (c->modrm_mod != 0)
+                               c->modrm_ea += c->regs[c->modrm_rm];
+                       else if (ctxt->mode == X86EMUL_MODE_PROT64)
+                               rip_relative = 1;
+                       break;
+               default:
+                       c->modrm_ea += c->regs[c->modrm_rm];
+                       break;
+               }
+               switch (c->modrm_mod) {
+               case 0:
+                       if ((c->modrm_rm & 7) == 5)
+                               c->modrm_ea += insn_fetch(s32, 4, c->eip);
+                       break;
+               case 1:
+                       c->modrm_ea += insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->modrm_ea += insn_fetch(s32, 4, c->eip);
+                       break;
+               }
+       }
+       if (rip_relative) {
+               /*
+                * c->eip currently points just past the displacement; add
+                * the size of the immediate that still follows it.
+                */
+               c->modrm_ea += c->eip;
+               switch (c->d & SrcMask) {
+               case SrcImmByte:
+                       c->modrm_ea += 1;
+                       break;
+               case SrcImm:
+                       if (c->d & ByteOp)
+                               c->modrm_ea += 1;
+                       else
+                               if (c->op_bytes == 8)
+                                       c->modrm_ea += 4;
+                               else
+                                       c->modrm_ea += c->op_bytes;
+               }
+       }
+done:
+       return rc;
+}
+
+/*
+ * Decode an absolute memory operand: fetch a displacement of c->ad_bytes
+ * bytes (2, 4 or 8) from the instruction stream into c->modrm_ea.  Any
+ * other address size fetches nothing and leaves c->modrm_ea unchanged.
+ *
+ * insn_fetch() advances c->eip and jumps to 'done' with a non-zero 'rc'
+ * when the fetch fails.
+ *
+ * Returns 0 on success, or the non-zero code of the failed fetch.
+ */
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+                     struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+
+       if (c->ad_bytes == 2)
+               c->modrm_ea = insn_fetch(u16, 2, c->eip);
+       else if (c->ad_bytes == 4)
+               c->modrm_ea = insn_fetch(u32, 4, c->eip);
+       else if (c->ad_bytes == 8)
+               c->modrm_ea = insn_fetch(u64, 8, c->eip);
+done:
+       return rc;
+}
+
 int
 x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
        struct decode_cache *c = &ctxt->decode;
-       u8 sib, rex_prefix = 0;
-       unsigned int i;
        int rc = 0;
        int mode = ctxt->mode;
-       int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
+       int def_op_bytes, def_ad_bytes;
 
        /* Shadow copy of register state. Committed on successful emulation. */
 
@@ -535,34 +769,38 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        switch (mode) {
        case X86EMUL_MODE_REAL:
        case X86EMUL_MODE_PROT16:
-               c->op_bytes = c->ad_bytes = 2;
+               def_op_bytes = def_ad_bytes = 2;
                break;
        case X86EMUL_MODE_PROT32:
-               c->op_bytes = c->ad_bytes = 4;
+               def_op_bytes = def_ad_bytes = 4;
                break;
 #ifdef CONFIG_X86_64
        case X86EMUL_MODE_PROT64:
-               c->op_bytes = 4;
-               c->ad_bytes = 8;
+               def_op_bytes = 4;
+               def_ad_bytes = 8;
                break;
 #endif
        default:
                return -1;
        }
 
+       c->op_bytes = def_op_bytes;
+       c->ad_bytes = def_ad_bytes;
+
        /* Legacy prefixes. */
-       for (i = 0; i < 8; i++) {
+       for (;;) {
                switch (c->b = insn_fetch(u8, 1, c->eip)) {
                case 0x66:      /* operand-size override */
-                       c->op_bytes ^= 6;       /* switch between 2/4 bytes */
+                       /* switch between 2/4 bytes */
+                       c->op_bytes = def_op_bytes ^ 6;
                        break;
                case 0x67:      /* address-size override */
                        if (mode == X86EMUL_MODE_PROT64)
                                /* switch between 4/8 bytes */
-                               c->ad_bytes ^= 12;
+                               c->ad_bytes = def_ad_bytes ^ 12;
                        else
                                /* switch between 2/4 bytes */
-                               c->ad_bytes ^= 6;
+                               c->ad_bytes = def_ad_bytes ^ 6;
                        break;
                case 0x2e:      /* CS override */
                        c->override_base = &ctxt->cs_base;
@@ -582,30 +820,35 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
                case 0x36:      /* SS override */
                        c->override_base = &ctxt->ss_base;
                        break;
+               case 0x40 ... 0x4f: /* REX */
+                       if (mode != X86EMUL_MODE_PROT64)
+                               goto done_prefixes;
+                       c->rex_prefix = c->b;
+                       continue;
                case 0xf0:      /* LOCK */
                        c->lock_prefix = 1;
                        break;
                case 0xf2:      /* REPNE/REPNZ */
+                       c->rep_prefix = REPNE_PREFIX;
+                       break;
                case 0xf3:      /* REP/REPE/REPZ */
-                       c->rep_prefix = 1;
+                       c->rep_prefix = REPE_PREFIX;
                        break;
                default:
                        goto done_prefixes;
                }
+
+               /* Any legacy prefix after a REX prefix nullifies its effect. */
+
+               c->rex_prefix = 0;
        }
 
 done_prefixes:
 
        /* REX prefix. */
-       if ((mode == X86EMUL_MODE_PROT64) && ((c->b & 0xf0) == 0x40)) {
-               rex_prefix = c->b;
-               if (c->b & 8)
+       if (c->rex_prefix)
+               if (c->rex_prefix & 8)
                        c->op_bytes = 8;        /* REX.W */
-               c->modrm_reg = (c->b & 4) << 1; /* REX.R */
-               index_reg = (c->b & 2) << 2; /* REX.X */
-               c->modrm_rm = base_reg = (c->b & 1) << 3; /* REG.B */
-               c->b = insn_fetch(u8, 1, c->eip);
-       }
 
        /* Opcode byte(s). */
        c->d = opcode_table[c->b];
@@ -625,159 +868,25 @@ done_prefixes:
        }
 
        /* ModRM and SIB bytes. */
-       if (c->d & ModRM) {
-               c->modrm = insn_fetch(u8, 1, c->eip);
-               c->modrm_mod |= (c->modrm & 0xc0) >> 6;
-               c->modrm_reg |= (c->modrm & 0x38) >> 3;
-               c->modrm_rm |= (c->modrm & 0x07);
-               c->modrm_ea = 0;
-               c->use_modrm_ea = 1;
-
-               if (c->modrm_mod == 3) {
-                       c->modrm_val = *(unsigned long *)
-                         decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
-                       goto modrm_done;
-               }
+       if (c->d & ModRM)
+               rc = decode_modrm(ctxt, ops);
+       else if (c->d & MemAbs)
+               rc = decode_abs(ctxt, ops);
+       if (rc)
+               goto done;
 
-               if (c->ad_bytes == 2) {
-                       unsigned bx = c->regs[VCPU_REGS_RBX];
-                       unsigned bp = c->regs[VCPU_REGS_RBP];
-                       unsigned si = c->regs[VCPU_REGS_RSI];
-                       unsigned di = c->regs[VCPU_REGS_RDI];
+       if (!c->override_base)
+               c->override_base = &ctxt->ds_base;
+       if (mode == X86EMUL_MODE_PROT64 &&
+           c->override_base != &ctxt->fs_base &&
+           c->override_base != &ctxt->gs_base)
+               c->override_base = NULL;
 
-                       /* 16-bit ModR/M decode. */
-                       switch (c->modrm_mod) {
-                       case 0:
-                               if (c->modrm_rm == 6)
-                                       c->modrm_ea +=
-                                               insn_fetch(u16, 2, c->eip);
-                               break;
-                       case 1:
-                               c->modrm_ea += insn_fetch(s8, 1, c->eip);
-                               break;
-                       case 2:
-                               c->modrm_ea += insn_fetch(u16, 2, c->eip);
-                               break;
-                       }
-                       switch (c->modrm_rm) {
-                       case 0:
-                               c->modrm_ea += bx + si;
-                               break;
-                       case 1:
-                               c->modrm_ea += bx + di;
-                               break;
-                       case 2:
-                               c->modrm_ea += bp + si;
-                               break;
-                       case 3:
-                               c->modrm_ea += bp + di;
-                               break;
-                       case 4:
-                               c->modrm_ea += si;
-                               break;
-                       case 5:
-                               c->modrm_ea += di;
-                               break;
-                       case 6:
-                               if (c->modrm_mod != 0)
-                                       c->modrm_ea += bp;
-                               break;
-                       case 7:
-                               c->modrm_ea += bx;
-                               break;
-                       }
-                       if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
-                           (c->modrm_rm == 6 && c->modrm_mod != 0))
-                               if (!c->override_base)
-                                       c->override_base = &ctxt->ss_base;
-                       c->modrm_ea = (u16)c->modrm_ea;
-               } else {
-                       /* 32/64-bit ModR/M decode. */
-                       switch (c->modrm_rm) {
-                       case 4:
-                       case 12:
-                               sib = insn_fetch(u8, 1, c->eip);
-                               index_reg |= (sib >> 3) & 7;
-                               base_reg |= sib & 7;
-                               scale = sib >> 6;
-
-                               switch (base_reg) {
-                               case 5:
-                                       if (c->modrm_mod != 0)
-                                               c->modrm_ea +=
-                                                       c->regs[base_reg];
-                                       else
-                                               c->modrm_ea +=
-                                                   insn_fetch(s32, 4, c->eip);
-                                       break;
-                               default:
-                                       c->modrm_ea += c->regs[base_reg];
-                               }
-                               switch (index_reg) {
-                               case 4:
-                                       break;
-                               default:
-                                       c->modrm_ea +=
-                                               c->regs[index_reg] << scale;
-
-                               }
-                               break;
-                       case 5:
-                               if (c->modrm_mod != 0)
-                                       c->modrm_ea += c->regs[c->modrm_rm];
-                               else if (mode == X86EMUL_MODE_PROT64)
-                                       rip_relative = 1;
-                               break;
-                       default:
-                               c->modrm_ea += c->regs[c->modrm_rm];
-                               break;
-                       }
-                       switch (c->modrm_mod) {
-                       case 0:
-                               if (c->modrm_rm == 5)
-                                       c->modrm_ea +=
-                                               insn_fetch(s32, 4, c->eip);
-                               break;
-                       case 1:
-                               c->modrm_ea += insn_fetch(s8, 1, c->eip);
-                               break;
-                       case 2:
-                               c->modrm_ea += insn_fetch(s32, 4, c->eip);
-                               break;
-                       }
-               }
-               if (!c->override_base)
-                       c->override_base = &ctxt->ds_base;
-               if (mode == X86EMUL_MODE_PROT64 &&
-                   c->override_base != &ctxt->fs_base &&
-                   c->override_base != &ctxt->gs_base)
-                       c->override_base = NULL;
-
-               if (c->override_base)
-                       c->modrm_ea += *c->override_base;
-
-               if (rip_relative) {
-                       c->modrm_ea += c->eip;
-                       switch (c->d & SrcMask) {
-                       case SrcImmByte:
-                               c->modrm_ea += 1;
-                               break;
-                       case SrcImm:
-                               if (c->d & ByteOp)
-                                       c->modrm_ea += 1;
-                               else
-                                       if (c->op_bytes == 8)
-                                               c->modrm_ea += 4;
-                                       else
-                                               c->modrm_ea += c->op_bytes;
-                       }
-               }
-               if (c->ad_bytes != 8)
-                       c->modrm_ea = (u32)c->modrm_ea;
-       modrm_done:
-               ;
-       }
+       if (c->override_base)
+               c->modrm_ea += *c->override_base;
 
+       if (c->ad_bytes != 8)
+               c->modrm_ea = (u32)c->modrm_ea;
        /*
         * Decode and fetch the source operand: register, memory
         * or immediate.
@@ -786,31 +895,7 @@ done_prefixes:
        case SrcNone:
                break;
        case SrcReg:
-               c->src.type = OP_REG;
-               if (c->d & ByteOp) {
-                       c->src.ptr =
-                               decode_register(c->modrm_reg, c->regs,
-                                                 (rex_prefix == 0));
-                       c->src.val = c->src.orig_val = *(u8 *)c->src.ptr;
-                       c->src.bytes = 1;
-               } else {
-                       c->src.ptr =
-                           decode_register(c->modrm_reg, c->regs, 0);
-                       switch ((c->src.bytes = c->op_bytes)) {
-                       case 2:
-                               c->src.val = c->src.orig_val =
-                                                      *(u16 *) c->src.ptr;
-                               break;
-                       case 4:
-                               c->src.val = c->src.orig_val =
-                                                      *(u32 *) c->src.ptr;
-                               break;
-                       case 8:
-                               c->src.val = c->src.orig_val =
-                                                      *(u64 *) c->src.ptr;
-                               break;
-                       }
-               }
+               decode_register_operand(&c->src, c, 0);
                break;
        case SrcMem16:
                c->src.bytes = 2;
@@ -822,10 +907,9 @@ done_prefixes:
                c->src.bytes = (c->d & ByteOp) ? 1 :
                                                           c->op_bytes;
                /* Don't fetch the address for invlpg: it could be unmapped. */
-               if (c->twobyte && c->b == 0x01
-                                   && c->modrm_reg == 7)
+               if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
                        break;
-             srcmem_common:
+       srcmem_common:
                /*
                 * For instructions with a ModR/M byte, switch to register
                 * access if Mod = 3.
@@ -869,30 +953,8 @@ done_prefixes:
                /* Special instructions do their own operand decoding. */
                return 0;
        case DstReg:
-               c->dst.type = OP_REG;
-               if ((c->d & ByteOp)
-                   && !(c->twobyte &&
-                       (c->b == 0xb6 || c->b == 0xb7))) {
-                       c->dst.ptr =
-                               decode_register(c->modrm_reg, c->regs,
-                                                 (rex_prefix == 0));
-                       c->dst.val = *(u8 *) c->dst.ptr;
-                       c->dst.bytes = 1;
-               } else {
-                       c->dst.ptr =
-                           decode_register(c->modrm_reg, c->regs, 0);
-                       switch ((c->dst.bytes = c->op_bytes)) {
-                       case 2:
-                               c->dst.val = *(u16 *)c->dst.ptr;
-                               break;
-                       case 4:
-                               c->dst.val = *(u32 *)c->dst.ptr;
-                               break;
-                       case 8:
-                               c->dst.val = *(u64 *)c->dst.ptr;
-                               break;
-                       }
-               }
+               decode_register_operand(&c->dst, c,
+                        c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
                break;
        case DstMem:
                if ((c->d & ModRM) && c->modrm_mod == 3) {
@@ -907,26 +969,273 @@ done:
        return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+{
+       struct decode_cache *c = &ctxt->decode;
+       /* Stage a push of c->src.val: only c->dst and the shadow RSP are set up here. */
+       c->dst.type  = OP_MEM;  /* presumably stored later by writeback() - confirm */
+       c->dst.bytes = c->op_bytes;
+       c->dst.val = c->src.val;
+       register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
+       c->dst.ptr = (void *) register_address(ctxt->ss_base,
+                                              c->regs[VCPU_REGS_RSP]); /* RSP was already lowered above */
+}
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+                               struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc;
+       /* POP for Grp1a (opcode 0x8f): load the stack top into c->dst.val, then raise RSP. */
+       /* 64-bit mode: POP always pops a 64-bit operand. */
+
+       if (ctxt->mode == X86EMUL_MODE_PROT64)
+               c->dst.bytes = 8;
+       /* A failed read returns its code at once, before RSP has been modified. */
+       rc = ops->read_std(register_address(ctxt->ss_base,
+                                           c->regs[VCPU_REGS_RSP]),
+                          &c->dst.val, c->dst.bytes, ctxt->vcpu);
+       if (rc != 0)
+               return rc;
+       /* Advance RSP by exactly the number of bytes that were read. */
+       register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
+
+       return 0;       /* success; the popped value is left in c->dst.val */
+}
+
+static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+{
+       struct decode_cache *c = &ctxt->decode; /* 0xd0-0xd3 callers preset c->src.val (1 or RCX) */
+       switch (c->modrm_reg) { /* Grp2: modrm_reg picks the shift/rotate; no default case */
+       case 0: /* rol: rotate left */
+               emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+               break;
+       case 1: /* ror: rotate right */
+               emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+               break;
+       case 2: /* rcl: rotate left through carry */
+               emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+               break;
+       case 3: /* rcr: rotate right through carry */
+               emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+               break;
+       case 4: /* sal/shl: shift left */
+       case 6: /* sal/shl: reg value 6 shares the handling of 4 */
+               emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+               break;
+       case 5: /* shr: logical shift right */
+               emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+               break;
+       case 7: /* sar: arithmetic shift right */
+               emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+               break;
+       }
+}
+
+static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+       /* Grp3 (0xf6/0xf7 in x86_emulate_insn): modrm_reg picks test, not or neg. */
+       switch (c->modrm_reg) {
+       case 0 ... 1:   /* test */
+               /*
+                * Special case in Grp3: test has an immediate
+                * source operand.
+                */
+               c->src.type = OP_IMM;
+               c->src.ptr = (unsigned long *)c->eip;   /* immediate is fetched from the current eip */
+               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               if (c->src.bytes == 8)
+                       c->src.bytes = 4;       /* only 1-, 2- and 4-byte fetches exist below */
+               switch (c->src.bytes) {
+               case 1:
+                       c->src.val = insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->src.val = insn_fetch(s16, 2, c->eip);
+                       break;
+               case 4:
+                       c->src.val = insn_fetch(s32, 4, c->eip);
+                       break;
+               }
+               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+               break;
+       case 2: /* not */
+               c->dst.val = ~c->dst.val;       /* plain complement; eflags is not touched here */
+               break;
+       case 3: /* neg */
+               emulate_1op("neg", c->dst, ctxt->eflags);
+               break;
+       default:        /* every other modrm_reg value is rejected */
+               DPRINTF("Cannot emulate %02x\n", c->b);
+               rc = X86EMUL_UNHANDLEABLE;
+               break;
+       }
+done:  /* NOTE(review): no visible goto targets this; presumably insn_fetch() does - confirm */
+       return rc;      /* 0 unless set to X86EMUL_UNHANDLEABLE above */
+}
+
+static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc;
+       /* Grp4/Grp5 (0xfe/0xff in x86_emulate_insn): modrm_reg picks inc, dec, jmp abs or push. */
+       switch (c->modrm_reg) {
+       case 0: /* inc */
+               emulate_1op("inc", c->dst, ctxt->eflags);
+               break;
+       case 1: /* dec */
+               emulate_1op("dec", c->dst, ctxt->eflags);
+               break;
+       case 4: /* jmp abs */
+               if (c->b == 0xff)
+                       c->eip = c->dst.val;    /* jump target is the decoded operand value */
+               else {
+                       DPRINTF("Cannot emulate %02x\n", c->b);
+                       return X86EMUL_UNHANDLEABLE;
+               }
+               break;
+       case 6: /* push; unlike jmp abs above, c->b is not checked here */
+
+               /* 64-bit mode: PUSH always pushes a 64-bit operand. */
+               /* The operand is therefore re-read as 8 bytes from the address in c->dst.ptr. */
+               if (ctxt->mode == X86EMUL_MODE_PROT64) {
+                       c->dst.bytes = 8;
+                       rc = ops->read_std((unsigned long)c->dst.ptr,
+                                          &c->dst.val, 8, ctxt->vcpu);
+                       if (rc != 0)
+                               return rc;
+               }
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          -c->dst.bytes);      /* lower RSP before storing */
+               rc = ops->write_emulated(register_address(ctxt->ss_base,
+                                   c->regs[VCPU_REGS_RSP]), &c->dst.val,
+                                   c->dst.bytes, ctxt->vcpu);
+               if (rc != 0)
+                       return rc;
+               c->dst.type = OP_NONE;  /* stored above; writeback() does nothing for OP_NONE */
+               break;
+       default:        /* every other modrm_reg value is rejected */
+               DPRINTF("Cannot emulate %02x\n", c->b);
+               return X86EMUL_UNHANDLEABLE;
+       }
+       return 0;       /* success; failures returned rc or X86EMUL_UNHANDLEABLE above */
+}
+
+static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops,
+                              unsigned long cr2)
+{
+       struct decode_cache *c = &ctxt->decode;
+       u64 old, new;
+       int rc;
+       /* 8-byte compare-and-exchange on the memory at cr2 (looks like CMPXCHG8B - confirm). */
+       rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu);
+       if (rc != 0)
+               return rc;
+       /* Low 32 bits of `old` are compared with RAX, high 32 bits with RDX. */
+       if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
+           ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
+               /* Mismatch: hand the memory value back in RAX/RDX and clear ZF. */
+               c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+               c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+               ctxt->eflags &= ~EFLG_ZF;
+
+       } else {
+               new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+                      (u32) c->regs[VCPU_REGS_RBX];    /* RCX in the high half, low 32 bits of RBX below */
+               /* Match: pass both values to cmpxchg_emulated; ZF is set only if it returns 0. */
+               rc = ops->cmpxchg_emulated(cr2, &old, &new, 8, ctxt->vcpu);
+               if (rc != 0)
+                       return rc;
+               ctxt->eflags |= EFLG_ZF;
+       }
+       return 0;       /* success on either branch; op failures returned their rc above */
+}
+
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+                           struct x86_emulate_ops *ops)
+{
+       int rc;
+       struct decode_cache *c = &ctxt->decode;
+       /* Store c->dst.val where c->dst says; returns 0, or the rc of a failing memory op. */
+       switch (c->dst.type) {
+       case OP_REG:
+               /* The 4-byte case *is* correct:
+                * in 64-bit mode we zero-extend.
+                */
+               switch (c->dst.bytes) {
+               case 1:
+                       *(u8 *)c->dst.ptr = (u8)c->dst.val;     /* one-byte store through c->dst.ptr */
+                       break;
+               case 2:
+                       *(u16 *)c->dst.ptr = (u16)c->dst.val;   /* two-byte store through c->dst.ptr */
+                       break;
+               case 4:
+                       *c->dst.ptr = (u32)c->dst.val;
+                       break;  /* 64b: zero-ext */
+               case 8:
+                       *c->dst.ptr = c->dst.val;
+                       break;
+               }
+               break;
+       case OP_MEM:
+               if (c->lock_prefix)     /* lock prefix seen: use cmpxchg_emulated with orig_val */
+                       rc = ops->cmpxchg_emulated(
+                                       (unsigned long)c->dst.ptr,
+                                       &c->dst.orig_val,
+                                       &c->dst.val,
+                                       c->dst.bytes,
+                                       ctxt->vcpu);
+               else    /* no lock prefix: plain emulated write */
+                       rc = ops->write_emulated(
+                                       (unsigned long)c->dst.ptr,
+                                       &c->dst.val,
+                                       c->dst.bytes,
+                                       ctxt->vcpu);
+               if (rc != 0)
+                       return rc;
+               break;
+       case OP_NONE:
+               /* no writeback */
+               break;
+       default:        /* any other operand type is left untouched */
+               break;
+       }
+       return 0;
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
        unsigned long cr2 = ctxt->cr2;
-       int no_wb = 0;
        u64 msr_data;
-       unsigned long _eflags = ctxt->eflags;
+       unsigned long saved_eip = 0;
        struct decode_cache *c = &ctxt->decode;
        int rc = 0;
 
-       if ((c->d & ModRM) && (c->modrm_mod != 3))
+       /* Shadow copy of register state. Committed on successful emulation.
+        * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
+        * modify them.
+        */
+
+       memcpy(c->regs, ctxt->vcpu->regs, sizeof c->regs);
+       saved_eip = c->eip;
+
+       if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
                cr2 = c->modrm_ea;
 
        if (c->src.type == OP_MEM) {
                c->src.ptr = (unsigned long *)cr2;
                c->src.val = 0;
-               if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
-                                            &c->src.val,
-                                            c->src.bytes,
-                                            ctxt->vcpu)) != 0)
+               rc = ops->read_emulated((unsigned long)c->src.ptr,
+                                       &c->src.val,
+                                       c->src.bytes,
+                                       ctxt->vcpu);
+               if (rc != 0)
                        goto done;
                c->src.orig_val = c->src.val;
        }
@@ -960,23 +1269,23 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        switch (c->b) {
        case 0x00 ... 0x05:
              add:              /* add */
-               emulate_2op_SrcV("add", c->src, c->dst, _eflags);
+               emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
                break;
        case 0x08 ... 0x0d:
              or:               /* or */
-               emulate_2op_SrcV("or", c->src, c->dst, _eflags);
+               emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
                break;
        case 0x10 ... 0x15:
              adc:              /* adc */
-               emulate_2op_SrcV("adc", c->src, c->dst, _eflags);
+               emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
                break;
        case 0x18 ... 0x1d:
              sbb:              /* sbb */
-               emulate_2op_SrcV("sbb", c->src, c->dst, _eflags);
+               emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
                break;
        case 0x20 ... 0x23:
              and:              /* and */
-               emulate_2op_SrcV("and", c->src, c->dst, _eflags);
+               emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
                break;
        case 0x24:              /* and al imm8 */
                c->dst.type = OP_REG;
@@ -997,15 +1306,41 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
                goto and;
        case 0x28 ... 0x2d:
              sub:              /* sub */
-               emulate_2op_SrcV("sub", c->src, c->dst, _eflags);
+               emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
                break;
        case 0x30 ... 0x35:
              xor:              /* xor */
-               emulate_2op_SrcV("xor", c->src, c->dst, _eflags);
+               emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
                break;
        case 0x38 ... 0x3d:
              cmp:              /* cmp */
-               emulate_2op_SrcV("cmp", c->src, c->dst, _eflags);
+               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x40 ... 0x47: /* inc r16/r32 */
+               emulate_1op("inc", c->dst, ctxt->eflags);
+               break;
+       case 0x48 ... 0x4f: /* dec r16/r32 */
+               emulate_1op("dec", c->dst, ctxt->eflags);
+               break;
+       case 0x50 ... 0x57:  /* push reg */
+               c->dst.type  = OP_MEM;
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = c->src.val;
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          -c->op_bytes);
+               c->dst.ptr = (void *) register_address(
+                       ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
+               break;
+       case 0x58 ... 0x5f: /* pop reg */
+       pop_instruction:
+               if ((rc = ops->read_std(register_address(ctxt->ss_base,
+                       c->regs[VCPU_REGS_RSP]), c->dst.ptr,
+                       c->op_bytes, ctxt->vcpu)) != 0)
+                       goto done;
+
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          c->op_bytes);
+               c->dst.type = OP_NONE;  /* Disable writeback. */
                break;
        case 0x63:              /* movsxd */
                if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1033,8 +1368,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
                }
                break;
        case 0x84 ... 0x85:
-             test:             /* test */
-               emulate_2op_SrcV("test", c->src, c->dst, _eflags);
+               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
                break;
        case 0x86 ... 0x87:     /* xchg */
                /* Write back the register source. */
@@ -1065,56 +1399,19 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
                c->dst.val = c->modrm_val;
                break;
        case 0x8f:              /* pop (sole member of Grp1a) */
-               /* 64-bit mode: POP always pops a 64-bit operand. */
-               if (ctxt->mode == X86EMUL_MODE_PROT64)
-                       c->dst.bytes = 8;
-               if ((rc = ops->read_std(register_address(
-                                                  ctxt->ss_base,
-                                                  c->regs[VCPU_REGS_RSP]),
-                                                  &c->dst.val,
-                                                  c->dst.bytes,
-                                                  ctxt->vcpu)) != 0)
+               rc = emulate_grp1a(ctxt, ops);
+               if (rc != 0)
                        goto done;
-               register_address_increment(c->regs[VCPU_REGS_RSP],
-                                          c->dst.bytes);
                break;
        case 0xa0 ... 0xa1:     /* mov */
                c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
                c->dst.val = c->src.val;
-               /* skip src displacement */
-               c->eip += c->ad_bytes;
                break;
        case 0xa2 ... 0xa3:     /* mov */
                c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
-               /* skip c->dst displacement */
-               c->eip += c->ad_bytes;
                break;
        case 0xc0 ... 0xc1:
-             grp2:             /* Grp2 */
-               switch (c->modrm_reg) {
-               case 0: /* rol */
-                       emulate_2op_SrcB("rol", c->src, c->dst, _eflags);
-                       break;
-               case 1: /* ror */
-                       emulate_2op_SrcB("ror", c->src, c->dst, _eflags);
-                       break;
-               case 2: /* rcl */
-                       emulate_2op_SrcB("rcl", c->src, c->dst, _eflags);
-                       break;
-               case 3: /* rcr */
-                       emulate_2op_SrcB("rcr", c->src, c->dst, _eflags);
-                       break;
-               case 4: /* sal/shl */
-               case 6: /* sal/shl */
-                       emulate_2op_SrcB("sal", c->src, c->dst, _eflags);
-                       break;
-               case 5: /* shr */
-                       emulate_2op_SrcB("shr", c->src, c->dst, _eflags);
-                       break;
-               case 7: /* sar */
-                       emulate_2op_SrcB("sar", c->src, c->dst, _eflags);
-                       break;
-               }
+               emulate_grp2(ctxt);
                break;
        case 0xc6 ... 0xc7:     /* mov (sole member of Grp11) */
        mov:
@@ -1122,177 +1419,48 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
                break;
        case 0xd0 ... 0xd1:     /* Grp2 */
                c->src.val = 1;
-               goto grp2;
+               emulate_grp2(ctxt);
+               break;
        case 0xd2 ... 0xd3:     /* Grp2 */
                c->src.val = c->regs[VCPU_REGS_RCX];
-               goto grp2;
+               emulate_grp2(ctxt);
+               break;
        case 0xf6 ... 0xf7:     /* Grp3 */
-               switch (c->modrm_reg) {
-               case 0 ... 1:   /* test */
-                       /*
-                        * Special case in Grp3: test has an immediate
-                        * source operand.
-                        */
-                       c->src.type = OP_IMM;
-                       c->src.ptr = (unsigned long *)c->eip;
-                       c->src.bytes = (c->d & ByteOp) ? 1 :
-                                                              c->op_bytes;
-                       if (c->src.bytes == 8)
-                               c->src.bytes = 4;
-                       switch (c->src.bytes) {
-                       case 1:
-                               c->src.val = insn_fetch(s8, 1, c->eip);
-                               break;
-                       case 2:
-                               c->src.val = insn_fetch(s16, 2, c->eip);
-                               break;
-                       case 4:
-                               c->src.val = insn_fetch(s32, 4, c->eip);
-                               break;
-                       }
-                       goto test;
-               case 2: /* not */
-                       c->dst.val = ~c->dst.val;
-                       break;
-               case 3: /* neg */
-                       emulate_1op("neg", c->dst, _eflags);
-                       break;
-               default:
-                       goto cannot_emulate;
-               }
+               rc = emulate_grp3(ctxt, ops);
+               if (rc != 0)
+                       goto done;
                break;
        case 0xfe ... 0xff:     /* Grp4/Grp5 */
-               switch (c->modrm_reg) {
-               case 0: /* inc */
-                       emulate_1op("inc", c->dst, _eflags);
-                       break;
-               case 1: /* dec */
-                       emulate_1op("dec", c->dst, _eflags);
-                       break;
-               case 4: /* jmp abs */
-                       if (c->b == 0xff)
-                               c->eip = c->dst.val;
-                       else
-                               goto cannot_emulate;
-                       break;
-               case 6: /* push */
-                       /* 64-bit mode: PUSH always pushes a 64-bit operand. */
-                       if (ctxt->mode == X86EMUL_MODE_PROT64) {
-                               c->dst.bytes = 8;
-                               if ((rc = ops->read_std(
-                                                (unsigned long)c->dst.ptr,
-                                                &c->dst.val, 8,
-                                                ctxt->vcpu)) != 0)
-                                       goto done;
-                       }
-                       register_address_increment(c->regs[VCPU_REGS_RSP],
-                                                  -c->dst.bytes);
-                       if ((rc = ops->write_emulated(
-                                    register_address(ctxt->ss_base,
-                                         c->regs[VCPU_REGS_RSP]),
-                                         &c->dst.val,
-                                          c->dst.bytes, ctxt->vcpu)) != 0)
-                               goto done;
-                       no_wb = 1;
-                       break;
-               default:
-                       goto cannot_emulate;
-               }
+               rc = emulate_grp45(ctxt, ops);
+               if (rc != 0)
+                       goto done;
                break;
        }
 
 writeback:
-       if (!no_wb) {
-               switch (c->dst.type) {
-               case OP_REG:
-                       /* The 4-byte case *is* correct:
-                        * in 64-bit mode we zero-extend.
-                        */
-                       switch (c->dst.bytes) {
-                       case 1:
-                               *(u8 *)c->dst.ptr = (u8)c->dst.val;
-                               break;
-                       case 2:
-                               *(u16 *)c->dst.ptr = (u16)c->dst.val;
-                               break;
-                       case 4:
-                               *c->dst.ptr = (u32)c->dst.val;
-                               break;  /* 64b: zero-ext */
-                       case 8:
-                               *c->dst.ptr = c->dst.val;
-                               break;
-                       }
-                       break;
-               case OP_MEM:
-                       if (c->lock_prefix)
-                               rc = ops->cmpxchg_emulated(
-                                               (unsigned long)c->dst.ptr,
-                                               &c->dst.orig_val,
-                                               &c->dst.val,
-                                               c->dst.bytes,
-                                               ctxt->vcpu);
-                       else
-                               rc = ops->write_emulated(
-                                               (unsigned long)c->dst.ptr,
-                                               &c->dst.val,
-                                               c->dst.bytes,
-                                               ctxt->vcpu);
-                       if (rc != 0)
-                               goto done;
-               default:
-                       break;
-               }
-       }
+       rc = writeback(ctxt, ops);
+       if (rc != 0)
+               goto done;
 
        /* Commit shadow register state. */
        memcpy(ctxt->vcpu->regs, c->regs, sizeof c->regs);
-       ctxt->eflags = _eflags;
        ctxt->vcpu->rip = c->eip;
 
 done:
-       return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+       if (rc == X86EMUL_UNHANDLEABLE) {
+               c->eip = saved_eip;
+               return -1;
+       }
+       return 0;
 
 special_insn:
        if (c->twobyte)
                goto twobyte_special_insn;
        switch (c->b) {
-       case 0x50 ... 0x57:  /* push reg */
-               if (c->op_bytes == 2)
-                       c->src.val = (u16) c->regs[c->b & 0x7];
-               else
-                       c->src.val = (u32) c->regs[c->b & 0x7];
-               c->dst.type  = OP_MEM;
-               c->dst.bytes = c->op_bytes;
-               c->dst.val = c->src.val;
-               register_address_increment(c->regs[VCPU_REGS_RSP],
-                                          -c->op_bytes);
-               c->dst.ptr = (void *) register_address(
-                       ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
-               break;
-       case 0x58 ... 0x5f: /* pop reg */
-               c->dst.ptr =
-                               (unsigned long *)&c->regs[c->b & 0x7];
-       pop_instruction:
-               if ((rc = ops->read_std(register_address(ctxt->ss_base,
-                       c->regs[VCPU_REGS_RSP]), c->dst.ptr,
-                       c->op_bytes, ctxt->vcpu)) != 0)
-                       goto done;
-
-               register_address_increment(c->regs[VCPU_REGS_RSP],
-                                          c->op_bytes);
-               no_wb = 1; /* Disable writeback. */
-               break;
        case 0x6a: /* push imm8 */
                c->src.val = 0L;
                c->src.val = insn_fetch(s8, 1, c->eip);
-push:
-               c->dst.type  = OP_MEM;
-               c->dst.bytes = c->op_bytes;
-               c->dst.val = c->src.val;
-               register_address_increment(c->regs[VCPU_REGS_RSP],
-                                          -c->op_bytes);
-               c->dst.ptr = (void *) register_address(ctxt->ss_base,
-                                                      c->regs[VCPU_REGS_RSP]);
+               emulate_push(ctxt);
                break;
        case 0x6c:              /* insb */
        case 0x6d:              /* insw/insd */
@@ -1301,12 +1469,14 @@ push:
                                (c->d & ByteOp) ? 1 : c->op_bytes,
                                c->rep_prefix ?
                                address_mask(c->regs[VCPU_REGS_RCX]) : 1,
-                               (_eflags & EFLG_DF),
+                               (ctxt->eflags & EFLG_DF),
                                register_address(ctxt->es_base,
                                                 c->regs[VCPU_REGS_RDI]),
                                c->rep_prefix,
-                               c->regs[VCPU_REGS_RDX]) == 0)
+                               c->regs[VCPU_REGS_RDX]) == 0) {
+                       c->eip = saved_eip;
                        return -1;
+               }
                return 0;
        case 0x6e:              /* outsb */
        case 0x6f:              /* outsw/outsd */
@@ -1315,27 +1485,30 @@ push:
                                (c->d & ByteOp) ? 1 : c->op_bytes,
                                c->rep_prefix ?
                                address_mask(c->regs[VCPU_REGS_RCX]) : 1,
-                               (_eflags & EFLG_DF),
+                               (ctxt->eflags & EFLG_DF),
                                register_address(c->override_base ?
                                                        *c->override_base :
                                                        ctxt->ds_base,
                                                 c->regs[VCPU_REGS_RSI]),
                                c->rep_prefix,
-                               c->regs[VCPU_REGS_RDX]) == 0)
+                               c->regs[VCPU_REGS_RDX]) == 0) {
+                       c->eip = saved_eip;
                        return -1;
+               }
                return 0;
        case 0x70 ... 0x7f: /* jcc (short) */ {
                int rel = insn_fetch(s8, 1, c->eip);
 
-               if (test_cc(c->b, _eflags))
+               if (test_cc(c->b, ctxt->eflags))
                JMP_REL(rel);
                break;
        }
        case 0x9c: /* pushf */
-               c->src.val =  (unsigned long) _eflags;
-               goto push;
+               c->src.val =  (unsigned long) ctxt->eflags;
+               emulate_push(ctxt);
+               break;
        case 0x9d: /* popf */
-               c->dst.ptr = (unsigned long *) &_eflags;
+               c->dst.ptr = (unsigned long *) &ctxt->eflags;
                goto pop_instruction;
        case 0xc3: /* ret */
                c->dst.ptr = &c->eip;
@@ -1343,6 +1516,23 @@ push:
        case 0xf4:              /* hlt */
                ctxt->vcpu->halt_request = 1;
                goto done;
+       case 0xf5:      /* cmc */
+               /* complement carry flag from eflags reg */
+               ctxt->eflags ^= EFLG_CF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xf8: /* clc */
+               ctxt->eflags &= ~EFLG_CF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xfa: /* cli */
+               ctxt->eflags &= ~X86_EFLAGS_IF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xfb: /* sti */
+               ctxt->eflags |= X86_EFLAGS_IF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
        }
        if (c->rep_prefix) {
                if (c->regs[VCPU_REGS_RCX] == 0) {
@@ -1367,10 +1557,10 @@ push:
                                        c->dst.bytes, ctxt->vcpu)) != 0)
                        goto done;
                register_address_increment(c->regs[VCPU_REGS_RSI],
-                                      (_eflags & EFLG_DF) ? -c->dst.bytes
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
                                                           : c->dst.bytes);
                register_address_increment(c->regs[VCPU_REGS_RDI],
-                                      (_eflags & EFLG_DF) ? -c->dst.bytes
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
                                                           : c->dst.bytes);
                break;
        case 0xa6 ... 0xa7:     /* cmps */
@@ -1379,22 +1569,28 @@ push:
        case 0xaa ... 0xab:     /* stos */
                c->dst.type = OP_MEM;
                c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-               c->dst.ptr = (unsigned long *)cr2;
+               c->dst.ptr = (unsigned long *)register_address(
+                                                  ctxt->es_base,
+                                                  c->regs[VCPU_REGS_RDI]);
                c->dst.val = c->regs[VCPU_REGS_RAX];
                register_address_increment(c->regs[VCPU_REGS_RDI],
-                                      (_eflags & EFLG_DF) ? -c->dst.bytes
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
                                                           : c->dst.bytes);
                break;
        case 0xac ... 0xad:     /* lods */
                c->dst.type = OP_REG;
                c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
                c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-               if ((rc = ops->read_emulated(cr2, &c->dst.val,
-                                            c->dst.bytes,
-                                            ctxt->vcpu)) != 0)
+               if ((rc = ops->read_emulated(register_address(
+                               c->override_base ? *c->override_base :
+                                                  ctxt->ds_base,
+                                                c->regs[VCPU_REGS_RSI]),
+                                                &c->dst.val,
+                                                c->dst.bytes,
+                                                ctxt->vcpu)) != 0)
                        goto done;
                register_address_increment(c->regs[VCPU_REGS_RSI],
-                                      (_eflags & EFLG_DF) ? -c->dst.bytes
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
                                                           : c->dst.bytes);
                break;
        case 0xae ... 0xaf:     /* scas */
@@ -1409,9 +1605,6 @@ push:
                case 4:
                        rel = insn_fetch(s32, 4, c->eip);
                        break;
-               case 8:
-                       rel = insn_fetch(s64, 8, c->eip);
-                       break;
                default:
                        DPRINTF("Call: Invalid op_bytes\n");
                        goto cannot_emulate;
@@ -1419,12 +1612,13 @@ push:
                c->src.val = (unsigned long) c->eip;
                JMP_REL(rel);
                c->op_bytes = c->ad_bytes;
-               goto push;
+               emulate_push(ctxt);
+               break;
        }
        case 0xe9: /* jmp rel */
        case 0xeb: /* jmp rel short */
                JMP_REL(c->src.val);
-               no_wb = 1; /* Disable writeback. */
+               c->dst.type = OP_NONE; /* Disable writeback. */
                break;
 
 
@@ -1434,8 +1628,6 @@ push:
 twobyte_insn:
        switch (c->b) {
        case 0x01: /* lgdt, lidt, lmsw */
-               /* Disable writeback. */
-               no_wb = 1;
                switch (c->modrm_reg) {
                        u16 size;
                        unsigned long address;
@@ -1481,7 +1673,8 @@ twobyte_insn:
                case 6: /* lmsw */
                        if (c->modrm_mod != 3)
                                goto cannot_emulate;
-                       realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val, &_eflags);
+                       realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
+                                                 &ctxt->eflags);
                        break;
                case 7: /* invlpg*/
                        emulate_invlpg(ctxt->vcpu, cr2);
@@ -1489,69 +1682,43 @@ twobyte_insn:
                default:
                        goto cannot_emulate;
                }
+               /* Disable writeback. */
+               c->dst.type = OP_NONE;
                break;
        case 0x21: /* mov from dr to reg */
-               no_wb = 1;
                if (c->modrm_mod != 3)
                        goto cannot_emulate;
-               rc = emulator_get_dr(ctxt, c->modrm_reg,
-                                    &c->regs[c->modrm_rm]);
+               rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+               if (rc)
+                       goto cannot_emulate;
+               c->dst.type = OP_NONE;  /* no writeback */
                break;
        case 0x23: /* mov from reg to dr */
-               no_wb = 1;
                if (c->modrm_mod != 3)
                        goto cannot_emulate;
                rc = emulator_set_dr(ctxt, c->modrm_reg,
                                     c->regs[c->modrm_rm]);
+               if (rc)
+                       goto cannot_emulate;
+               c->dst.type = OP_NONE;  /* no writeback */
                break;
        case 0x40 ... 0x4f:     /* cmov */
                c->dst.val = c->dst.orig_val = c->src.val;
-               no_wb = 1;
-               /*
-                * First, assume we're decoding an even cmov opcode
-                * (lsb == 0).
-                */
-               switch ((c->b & 15) >> 1) {
-               case 0: /* cmovo */
-                       no_wb = (_eflags & EFLG_OF) ? 0 : 1;
-                       break;
-               case 1: /* cmovb/cmovc/cmovnae */
-                       no_wb = (_eflags & EFLG_CF) ? 0 : 1;
-                       break;
-               case 2: /* cmovz/cmove */
-                       no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
-                       break;
-               case 3: /* cmovbe/cmovna */
-                       no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
-                       break;
-               case 4: /* cmovs */
-                       no_wb = (_eflags & EFLG_SF) ? 0 : 1;
-                       break;
-               case 5: /* cmovp/cmovpe */
-                       no_wb = (_eflags & EFLG_PF) ? 0 : 1;
-                       break;
-               case 7: /* cmovle/cmovng */
-                       no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
-                       /* fall through */
-               case 6: /* cmovl/cmovnge */
-                       no_wb &= (!(_eflags & EFLG_SF) !=
-                             !(_eflags & EFLG_OF)) ? 0 : 1;
-                       break;
-               }
-               /* Odd cmov opcodes (lsb == 1) have inverted sense. */
-               no_wb ^= c->b & 1;
+               if (!test_cc(c->b, ctxt->eflags))
+                       c->dst.type = OP_NONE; /* no writeback */
                break;
        case 0xa3:
              bt:               /* bt */
+               c->dst.type = OP_NONE;
                /* only subword offset */
                c->src.val &= (c->dst.bytes << 3) - 1;
-               emulate_2op_SrcV_nobyte("bt", c->src, c->dst, _eflags);
+               emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
                break;
        case 0xab:
              bts:              /* bts */
                /* only subword offset */
                c->src.val &= (c->dst.bytes << 3) - 1;
-               emulate_2op_SrcV_nobyte("bts", c->src, c->dst, _eflags);
+               emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
                break;
        case 0xb0 ... 0xb1:     /* cmpxchg */
                /*
@@ -1560,8 +1727,8 @@ twobyte_insn:
                 */
                c->src.orig_val = c->src.val;
                c->src.val = c->regs[VCPU_REGS_RAX];
-               emulate_2op_SrcV("cmp", c->src, c->dst, _eflags);
-               if (_eflags & EFLG_ZF) {
+               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+               if (ctxt->eflags & EFLG_ZF) {
                        /* Success: write back to memory. */
                        c->dst.val = c->src.orig_val;
                } else {
@@ -1574,7 +1741,7 @@ twobyte_insn:
              btr:              /* btr */
                /* only subword offset */
                c->src.val &= (c->dst.bytes << 3) - 1;
-               emulate_2op_SrcV_nobyte("btr", c->src, c->dst, _eflags);
+               emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
                break;
        case 0xb6 ... 0xb7:     /* movzx */
                c->dst.bytes = c->op_bytes;
@@ -1597,7 +1764,7 @@ twobyte_insn:
              btc:              /* btc */
                /* only subword offset */
                c->src.val &= (c->dst.bytes << 3) - 1;
-               emulate_2op_SrcV_nobyte("btc", c->src, c->dst, _eflags);
+               emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
                break;
        case 0xbe ... 0xbf:     /* movsx */
                c->dst.bytes = c->op_bytes;
@@ -1613,8 +1780,6 @@ twobyte_insn:
        goto writeback;
 
 twobyte_special_insn:
-       /* Disable writeback. */
-       no_wb = 1;
        switch (c->b) {
        case 0x06:
                emulate_clts(ctxt->vcpu);
@@ -1636,7 +1801,7 @@ twobyte_special_insn:
                if (c->modrm_mod != 3)
                        goto cannot_emulate;
                realmode_set_cr(ctxt->vcpu,
-                               c->modrm_reg, c->modrm_val, &_eflags);
+                               c->modrm_reg, c->modrm_val, &ctxt->eflags);
                break;
        case 0x30:
                /* wrmsr */
@@ -1651,8 +1816,7 @@ twobyte_special_insn:
                break;
        case 0x32:
                /* rdmsr */
-               rc = kvm_get_msr(ctxt->vcpu,
-                                c->regs[VCPU_REGS_RCX], &msr_data);
+               rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
                if (rc) {
                        kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
                        c->eip = ctxt->vcpu->rip;
@@ -1679,37 +1843,22 @@ twobyte_special_insn:
                        DPRINTF("jnz: Invalid op_bytes\n");
                        goto cannot_emulate;
                }
-               if (test_cc(c->b, _eflags))
+               if (test_cc(c->b, ctxt->eflags))
                        JMP_REL(rel);
                break;
        }
        case 0xc7:              /* Grp9 (cmpxchg8b) */
-               {
-                       u64 old, new;
-                       if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
-                                                                       != 0)
-                               goto done;
-                       if (((u32) (old >> 0) !=
-                                       (u32) c->regs[VCPU_REGS_RAX]) ||
-                           ((u32) (old >> 32) !=
-                                       (u32) c->regs[VCPU_REGS_RDX])) {
-                               c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
-                               c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
-                               _eflags &= ~EFLG_ZF;
-                       } else {
-                               new = ((u64)c->regs[VCPU_REGS_RCX] << 32)
-                                       | (u32) c->regs[VCPU_REGS_RBX];
-                               if ((rc = ops->cmpxchg_emulated(cr2, &old,
-                                                         &new, 8, ctxt->vcpu)) != 0)
-                                       goto done;
-                               _eflags |= EFLG_ZF;
-                       }
-                       break;
-               }
+               rc = emulate_grp9(ctxt, ops, cr2);
+               if (rc != 0)
+                       goto done;
+               break;
        }
+       /* Disable writeback. */
+       c->dst.type = OP_NONE;
        goto writeback;
 
 cannot_emulate:
        DPRINTF("Cannot emulate %02x\n", c->b);
+       c->eip = saved_eip;
        return -1;
 }