KVM: x86 emulator: address size and operand size overrides are sticky
[safe/jmp/linux-2.6] / drivers / kvm / x86_emulate.c
index 4c78a4f..3be506a 100644 (file)
 #include <stdio.h>
 #include <stdint.h>
 #include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf( _f , ## _a )
+#define DPRINTF(_f, _a ...) printf(_f , ## _a)
 #else
 #include "kvm.h"
+#include "x86.h"
 #define DPRINTF(x...) do {} while (0)
 #endif
 #include "x86_emulate.h"
@@ -62,8 +63,9 @@
 /* Destination is only written; never read. */
 #define Mov         (1<<7)
 #define BitOp       (1<<8)
+#define MemAbs      (1<<9)      /* Memory operand is absolute displacement */
 
-static u8 opcode_table[256] = {
+static u16 opcode_table[256] = {
        /* 0x00 - 0x07 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -96,14 +98,14 @@ static u8 opcode_table[256] = {
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
        0, 0, 0, 0,
-       /* 0x40 - 0x4F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x40 - 0x47 */
+       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+       /* 0x48 - 0x4F */
+       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
        /* 0x50 - 0x57 */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       SrcReg, SrcReg, SrcReg, SrcReg, SrcReg, SrcReg, SrcReg, SrcReg,
        /* 0x58 - 0x5F */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
        /* 0x60 - 0x67 */
        0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
        0, 0, 0, 0,
@@ -127,10 +129,10 @@ static u8 opcode_table[256] = {
        ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
        0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
        /* 0x90 - 0x9F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
        /* 0xA0 - 0xA7 */
-       ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
-       ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
+       ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
+       ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
        ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
        ByteOp | ImplicitOps, ImplicitOps,
        /* 0xA8 - 0xAF */
@@ -157,17 +159,17 @@ static u8 opcode_table[256] = {
        ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
        /* 0xF0 - 0xF7 */
        0, 0, 0, 0,
-       ImplicitOps, 0,
+       ImplicitOps, ImplicitOps,
        ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
        /* 0xF8 - 0xFF */
-       0, 0, 0, 0,
+       ImplicitOps, 0, ImplicitOps, ImplicitOps,
        0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
 };
 
 static u16 twobyte_table[256] = {
        /* 0x00 - 0x0F */
        0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
-       0, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
+       ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
        /* 0x10 - 0x1F */
        0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
        /* 0x20 - 0x2F */
@@ -212,7 +214,8 @@ static u16 twobyte_table[256] = {
        0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
            DstReg | SrcMem16 | ModRM | Mov,
        /* 0xC0 - 0xCF */
-       0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+       0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xD0 - 0xDF */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xE0 - 0xEF */
@@ -221,13 +224,6 @@ static u16 twobyte_table[256] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-/* Type, address-of, and value of an instruction's operand. */
-struct operand {
-       enum { OP_REG, OP_MEM, OP_IMM } type;
-       unsigned int bytes;
-       unsigned long val, orig_val, *ptr;
-};
-
 /* EFLAGS bit definitions. */
 #define EFLG_OF (1<<11)
 #define EFLG_DF (1<<10)
@@ -291,21 +287,21 @@ struct operand {
                switch ((_dst).bytes) {                                     \
                case 2:                                                     \
                        __asm__ __volatile__ (                              \
-                               _PRE_EFLAGS("0","4","2")                    \
+                               _PRE_EFLAGS("0", "4", "2")                  \
                                _op"w %"_wx"3,%1; "                         \
-                               _POST_EFLAGS("0","4","2")                   \
+                               _POST_EFLAGS("0", "4", "2")                 \
                                : "=m" (_eflags), "=m" ((_dst).val),        \
                                  "=&r" (_tmp)                              \
-                               : _wy ((_src).val), "i" (EFLAGS_MASK) );    \
+                               : _wy ((_src).val), "i" (EFLAGS_MASK));     \
                        break;                                              \
                case 4:                                                     \
                        __asm__ __volatile__ (                              \
-                               _PRE_EFLAGS("0","4","2")                    \
+                               _PRE_EFLAGS("0", "4", "2")                  \
                                _op"l %"_lx"3,%1; "                         \
-                               _POST_EFLAGS("0","4","2")                   \
+                               _POST_EFLAGS("0", "4", "2")                 \
                                : "=m" (_eflags), "=m" ((_dst).val),        \
                                  "=&r" (_tmp)                              \
-                               : _ly ((_src).val), "i" (EFLAGS_MASK) );    \
+                               : _ly ((_src).val), "i" (EFLAGS_MASK));     \
                        break;                                              \
                case 8:                                                     \
                        __emulate_2op_8byte(_op, _src, _dst,                \
@@ -317,16 +313,15 @@ struct operand {
 #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
        do {                                                                 \
                unsigned long _tmp;                                          \
-               switch ( (_dst).bytes )                                      \
-               {                                                            \
+               switch ((_dst).bytes) {                                      \
                case 1:                                                      \
                        __asm__ __volatile__ (                               \
-                               _PRE_EFLAGS("0","4","2")                     \
+                               _PRE_EFLAGS("0", "4", "2")                   \
                                _op"b %"_bx"3,%1; "                          \
-                               _POST_EFLAGS("0","4","2")                    \
+                               _POST_EFLAGS("0", "4", "2")                  \
                                : "=m" (_eflags), "=m" ((_dst).val),         \
                                  "=&r" (_tmp)                               \
-                               : _by ((_src).val), "i" (EFLAGS_MASK) );     \
+                               : _by ((_src).val), "i" (EFLAGS_MASK));      \
                        break;                                               \
                default:                                                     \
                        __emulate_2op_nobyte(_op, _src, _dst, _eflags,       \
@@ -355,34 +350,33 @@ struct operand {
        do {                                                            \
                unsigned long _tmp;                                     \
                                                                        \
-               switch ( (_dst).bytes )                                 \
-               {                                                       \
+               switch ((_dst).bytes) {                                 \
                case 1:                                                 \
                        __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0","3","2")                \
+                               _PRE_EFLAGS("0", "3", "2")              \
                                _op"b %1; "                             \
-                               _POST_EFLAGS("0","3","2")               \
+                               _POST_EFLAGS("0", "3", "2")             \
                                : "=m" (_eflags), "=m" ((_dst).val),    \
                                  "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK) );                  \
+                               : "i" (EFLAGS_MASK));                   \
                        break;                                          \
                case 2:                                                 \
                        __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0","3","2")                \
+                               _PRE_EFLAGS("0", "3", "2")              \
                                _op"w %1; "                             \
-                               _POST_EFLAGS("0","3","2")               \
+                               _POST_EFLAGS("0", "3", "2")             \
                                : "=m" (_eflags), "=m" ((_dst).val),    \
                                  "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK) );                  \
+                               : "i" (EFLAGS_MASK));                   \
                        break;                                          \
                case 4:                                                 \
                        __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0","3","2")                \
+                               _PRE_EFLAGS("0", "3", "2")              \
                                _op"l %1; "                             \
-                               _POST_EFLAGS("0","3","2")               \
+                               _POST_EFLAGS("0", "3", "2")             \
                                : "=m" (_eflags), "=m" ((_dst).val),    \
                                  "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK) );                  \
+                               : "i" (EFLAGS_MASK));                   \
                        break;                                          \
                case 8:                                                 \
                        __emulate_1op_8byte(_op, _dst, _eflags);        \
@@ -395,21 +389,21 @@ struct operand {
 #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
        do {                                                              \
                __asm__ __volatile__ (                                    \
-                       _PRE_EFLAGS("0","4","2")                          \
+                       _PRE_EFLAGS("0", "4", "2")                        \
                        _op"q %"_qx"3,%1; "                               \
-                       _POST_EFLAGS("0","4","2")                         \
+                       _POST_EFLAGS("0", "4", "2")                       \
                        : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-                       : _qy ((_src).val), "i" (EFLAGS_MASK) );          \
+                       : _qy ((_src).val), "i" (EFLAGS_MASK));         \
        } while (0)
 
 #define __emulate_1op_8byte(_op, _dst, _eflags)                           \
        do {                                                              \
                __asm__ __volatile__ (                                    \
-                       _PRE_EFLAGS("0","3","2")                          \
+                       _PRE_EFLAGS("0", "3", "2")                        \
                        _op"q %1; "                                       \
-                       _POST_EFLAGS("0","3","2")                         \
+                       _POST_EFLAGS("0", "3", "2")                       \
                        : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-                       : "i" (EFLAGS_MASK) );                            \
+                       : "i" (EFLAGS_MASK));                             \
        } while (0)
 
 #elif defined(__i386__)
@@ -420,9 +414,8 @@ struct operand {
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch(_type, _size, _eip)                                  \
 ({     unsigned long _x;                                               \
-       rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x,  \
-                                                  (_size), ctxt->vcpu); \
-       if ( rc != 0 )                                                  \
+       rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));            \
+       if (rc != 0)                                                    \
                goto done;                                              \
        (_eip) += (_size);                                              \
        (_type)_x;                                                      \
@@ -430,27 +423,63 @@ struct operand {
 
 /* Access/update address held in a register, based on addressing mode. */
 #define address_mask(reg)                                              \
-       ((ad_bytes == sizeof(unsigned long)) ?                          \
-               (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
+       ((c->ad_bytes == sizeof(unsigned long)) ?                       \
+               (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
 #define register_address(base, reg)                                     \
        ((base) + address_mask(reg))
 #define register_address_increment(reg, inc)                            \
        do {                                                            \
                /* signed type ensures sign extension to long */        \
                int _inc = (inc);                                       \
-               if ( ad_bytes == sizeof(unsigned long) )                \
+               if (c->ad_bytes == sizeof(unsigned long))               \
                        (reg) += _inc;                                  \
                else                                                    \
-                       (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
-                          (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
+                       (reg) = ((reg) &                                \
+                                ~((1UL << (c->ad_bytes << 3)) - 1)) |  \
+                               (((reg) + _inc) &                       \
+                                ((1UL << (c->ad_bytes << 3)) - 1));    \
        } while (0)
 
 #define JMP_REL(rel)                                                   \
        do {                                                            \
-               _eip += (int)(rel);                                     \
-               _eip = ((op_bytes == 2) ? (uint16_t)_eip : (uint32_t)_eip); \
+               register_address_increment(c->eip, rel);                \
        } while (0)
 
+/*
+ * Return in *dest the instruction byte at linear address 'linear', going
+ * through the per-decode fetch cache (ctxt->decode.fetch).
+ *
+ * When 'linear' lies outside the cached window [start, end), the window is
+ * refilled through ops->read_std() starting at 'linear'.  A refill covers at
+ * most 15 bytes (presumably the longest possible x86 instruction) and never
+ * reaches past the end of the page containing 'linear'.
+ *
+ * Returns 0 on success, or the non-zero code from ops->read_std(); in the
+ * failure case the window bounds are left unchanged.
+ */
+static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
+                             struct x86_emulate_ops *ops,
+                             unsigned long linear, u8 *dest)
+{
+       struct fetch_cache *cache = &ctxt->decode.fetch;
+       int cached = linear >= cache->start && linear < cache->end;
+
+       if (!cached) {
+               int len = min(15UL, PAGE_SIZE - offset_in_page(linear));
+               int err = ops->read_std(linear, cache->data, len, ctxt->vcpu);
+
+               if (err)
+                       return err;
+               cache->start = linear;
+               cache->end = linear + len;
+       }
+       *dest = cache->data[linear - cache->start];
+       return 0;
+}
+
+/*
+ * Copy 'size' instruction bytes into 'dest', starting at offset 'eip'
+ * relative to ctxt->cs_base.
+ *
+ * The bytes are pulled one at a time through do_fetch_insn_byte().  Returns 0
+ * on success.  If a byte cannot be fetched, that non-zero code is returned
+ * immediately and the remaining bytes of 'dest' are not written.
+ */
+static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
+                        struct x86_emulate_ops *ops,
+                        unsigned long eip, void *dest, unsigned size)
+{
+       unsigned long linear = eip + ctxt->cs_base;
+       u8 *out = dest;
+       unsigned i;
+
+       for (i = 0; i < size; i++) {
+               int err = do_fetch_insn_byte(ctxt, ops, linear + i, &out[i]);
+
+               if (err)
+                       return err;
+       }
+       return 0;
+}
+
 /*
  * Given the 'reg' portion of a ModRM byte, and a register block, return a
  * pointer into the block that addresses the relevant register.
@@ -521,460 +550,805 @@ static int test_cc(unsigned int condition, unsigned int flags)
        return (!!rc ^ (condition & 1));
 }
 
+/*
+ * Fill in 'op' as a register operand backed by the shadow registers in
+ * c->regs.
+ *
+ * The register number is c->modrm_reg for instructions that carry a ModRM
+ * byte; otherwise it comes from the low three bits of the opcode byte c->b,
+ * extended by bit 0 of the REX prefix.  ByteOp instructions get a one-byte
+ * operand unless 'inhibit_bytereg' is non-zero; everything else is
+ * c->op_bytes wide.  op->val and op->orig_val receive the current register
+ * contents, zero-extended from the operand size (op->val is left untouched
+ * if c->op_bytes is not 2, 4 or 8).
+ */
+static void decode_register_operand(struct operand *op,
+                                   struct decode_cache *c,
+                                   int inhibit_bytereg)
+{
+       int byte_operand = (c->d & ByteOp) && !inhibit_bytereg;
+       unsigned regno;
+
+       if (c->d & ModRM)
+               regno = c->modrm_reg;
+       else
+               regno = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+
+       op->type = OP_REG;
+       if (byte_operand) {
+               /* High-byte registers are requested only without a REX prefix. */
+               op->ptr = decode_register(regno, c->regs, c->rex_prefix == 0);
+               op->bytes = 1;
+               op->val = *(u8 *)op->ptr;
+       } else {
+               op->ptr = decode_register(regno, c->regs, 0);
+               op->bytes = c->op_bytes;
+               if (op->bytes == 2)
+                       op->val = *(u16 *)op->ptr;
+               else if (op->bytes == 4)
+                       op->val = *(u32 *)op->ptr;
+               else if (op->bytes == 8)
+                       op->val = *(u64 *)op->ptr;
+       }
+       op->orig_val = op->val;
+}
+
+/*
+ * Decode the ModRM byte, plus any SIB byte and displacement, of the
+ * instruction being decoded in ctxt->decode.
+ *
+ * On return c->modrm_mod/modrm_reg/modrm_rm hold the decoded fields (with the
+ * REX.R and REX.B extensions folded in) and c->use_modrm_ea is set.  For the
+ * register form (mod == 3) c->modrm_val is loaded from the selected register
+ * and no effective address is computed; otherwise c->modrm_ea receives the
+ * effective address.  In 16-bit addressing, BP-based forms select
+ * ctxt->ss_base as the segment base unless an override is already recorded.
+ *
+ * The "no base register" encodings (mod == 0 with rm == 101b, and mod == 0
+ * with SIB base == 101b) are recognised on the low three bits only: REX.B
+ * does not turn them into r13-based addressing.
+ *
+ * Returns 0 on success, or the non-zero code of a failed instruction fetch.
+ */
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+                       struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       u8 sib;
+       int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
+       int rc = 0;
+
+       if (c->rex_prefix) {
+               c->modrm_reg = (c->rex_prefix & 4) << 1;        /* REX.R */
+               index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
+               c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
+       }
+
+       /* insn_fetch() sets rc and jumps to 'done' when a fetch fails. */
+       c->modrm = insn_fetch(u8, 1, c->eip);
+       c->modrm_mod |= (c->modrm & 0xc0) >> 6;
+       c->modrm_reg |= (c->modrm & 0x38) >> 3;
+       c->modrm_rm |= (c->modrm & 0x07);
+       c->modrm_ea = 0;
+       c->use_modrm_ea = 1;
+
+       if (c->modrm_mod == 3) {
+               /* Register operand: no memory address to compute. */
+               c->modrm_val = *(unsigned long *)
+                       decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+               return rc;
+       }
+
+       if (c->ad_bytes == 2) {
+               unsigned bx = c->regs[VCPU_REGS_RBX];
+               unsigned bp = c->regs[VCPU_REGS_RBP];
+               unsigned si = c->regs[VCPU_REGS_RSI];
+               unsigned di = c->regs[VCPU_REGS_RDI];
+
+               /* 16-bit ModR/M decode. */
+               switch (c->modrm_mod) {
+               case 0:
+                       if (c->modrm_rm == 6)
+                               c->modrm_ea += insn_fetch(u16, 2, c->eip);
+                       break;
+               case 1:
+                       c->modrm_ea += insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->modrm_ea += insn_fetch(u16, 2, c->eip);
+                       break;
+               }
+               switch (c->modrm_rm) {
+               case 0:
+                       c->modrm_ea += bx + si;
+                       break;
+               case 1:
+                       c->modrm_ea += bx + di;
+                       break;
+               case 2:
+                       c->modrm_ea += bp + si;
+                       break;
+               case 3:
+                       c->modrm_ea += bp + di;
+                       break;
+               case 4:
+                       c->modrm_ea += si;
+                       break;
+               case 5:
+                       c->modrm_ea += di;
+                       break;
+               case 6:
+                       /* mod == 0, rm == 6 is a bare disp16 with no base. */
+                       if (c->modrm_mod != 0)
+                               c->modrm_ea += bp;
+                       break;
+               case 7:
+                       c->modrm_ea += bx;
+                       break;
+               }
+               /* BP-based addressing defaults to the stack segment base. */
+               if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
+                   (c->modrm_rm == 6 && c->modrm_mod != 0))
+                       if (!c->override_base)
+                               c->override_base = &ctxt->ss_base;
+               c->modrm_ea = (u16)c->modrm_ea;
+       } else {
+               /* 32/64-bit ModR/M decode. */
+               if ((c->modrm_rm & 7) == 4) {
+                       /* rm == 100b: a SIB byte follows. */
+                       sib = insn_fetch(u8, 1, c->eip);
+                       index_reg |= (sib >> 3) & 7;
+                       base_reg |= sib & 7;
+                       scale = sib >> 6;
+
+                       /*
+                        * base == 101b with mod == 0 means "disp32, no base
+                        * register"; test the low bits only so that REX.B
+                        * does not select r13 as the base here.
+                        */
+                       if ((base_reg & 7) == 5 && c->modrm_mod == 0)
+                               c->modrm_ea += insn_fetch(s32, 4, c->eip);
+                       else
+                               c->modrm_ea += c->regs[base_reg];
+
+                       /* index == 4 (without REX.X) encodes "no index". */
+                       if (index_reg != 4)
+                               c->modrm_ea += c->regs[index_reg] << scale;
+               } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
+                       /*
+                        * rm == 101b with mod == 0 is disp32 only (fetched
+                        * below), RIP-relative in 64-bit mode.  REX.B is
+                        * ignored for this encoding.
+                        */
+                       if (ctxt->mode == X86EMUL_MODE_PROT64)
+                               rip_relative = 1;
+               } else {
+                       c->modrm_ea += c->regs[c->modrm_rm];
+               }
+               switch (c->modrm_mod) {
+               case 0:
+                       if ((c->modrm_rm & 7) == 5)
+                               c->modrm_ea += insn_fetch(s32, 4, c->eip);
+                       break;
+               case 1:
+                       c->modrm_ea += insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->modrm_ea += insn_fetch(s32, 4, c->eip);
+                       break;
+               }
+       }
+       if (rip_relative) {
+               /*
+                * c->eip now points just past the displacement.  Add the size
+                * of any immediate that has not been fetched yet, so that the
+                * address is taken relative to the end of the instruction.
+                */
+               c->modrm_ea += c->eip;
+               switch (c->d & SrcMask) {
+               case SrcImmByte:
+                       c->modrm_ea += 1;
+                       break;
+               case SrcImm:
+                       if (c->d & ByteOp)
+                               c->modrm_ea += 1;
+                       else if (c->op_bytes == 8)
+                               c->modrm_ea += 4;
+                       else
+                               c->modrm_ea += c->op_bytes;
+               }
+       }
+done:
+       return rc;
+}
+
+/*
+ * Fetch the absolute memory offset that follows the opcode and store it in
+ * c->modrm_ea.  The offset is c->ad_bytes wide (2, 4 or 8 bytes) and is
+ * zero-extended; any other address size leaves c->modrm_ea untouched.
+ *
+ * Returns 0 on success, or the non-zero code of a failed instruction fetch.
+ */
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+                     struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+
+       /* insn_fetch() sets rc and jumps to 'done' when the fetch fails. */
+       if (c->ad_bytes == 2)
+               c->modrm_ea = insn_fetch(u16, 2, c->eip);
+       else if (c->ad_bytes == 4)
+               c->modrm_ea = insn_fetch(u32, 4, c->eip);
+       else if (c->ad_bytes == 8)
+               c->modrm_ea = insn_fetch(u64, 8, c->eip);
+done:
+       return rc;
+}
+
 int
-x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
-       unsigned d;
-       u8 b, sib, twobyte = 0, rex_prefix = 0;
-       u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
-       unsigned long *override_base = NULL;
-       unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
+       struct decode_cache *c = &ctxt->decode;
        int rc = 0;
-       struct operand src, dst;
-       unsigned long cr2 = ctxt->cr2;
        int mode = ctxt->mode;
-       unsigned long modrm_ea;
-       int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
-       int no_wb = 0;
-       u64 msr_data;
+       int def_op_bytes, def_ad_bytes;
 
        /* Shadow copy of register state. Committed on successful emulation. */
-       unsigned long _regs[NR_VCPU_REGS];
-       unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
-       unsigned long modrm_val = 0;
 
-       memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
+       memset(c, 0, sizeof(struct decode_cache));
+       c->eip = ctxt->vcpu->rip;
+       memcpy(c->regs, ctxt->vcpu->regs, sizeof c->regs);
 
        switch (mode) {
        case X86EMUL_MODE_REAL:
        case X86EMUL_MODE_PROT16:
-               op_bytes = ad_bytes = 2;
+               def_op_bytes = def_ad_bytes = 2;
                break;
        case X86EMUL_MODE_PROT32:
-               op_bytes = ad_bytes = 4;
+               def_op_bytes = def_ad_bytes = 4;
                break;
 #ifdef CONFIG_X86_64
        case X86EMUL_MODE_PROT64:
-               op_bytes = 4;
-               ad_bytes = 8;
+               def_op_bytes = 4;
+               def_ad_bytes = 8;
                break;
 #endif
        default:
                return -1;
        }
 
+       c->op_bytes = def_op_bytes;
+       c->ad_bytes = def_ad_bytes;
+
        /* Legacy prefixes. */
-       for (i = 0; i < 8; i++) {
-               switch (b = insn_fetch(u8, 1, _eip)) {
+       for (;;) {
+               switch (c->b = insn_fetch(u8, 1, c->eip)) {
                case 0x66:      /* operand-size override */
-                       op_bytes ^= 6;  /* switch between 2/4 bytes */
+                       /* switch between 2/4 bytes */
+                       c->op_bytes = def_op_bytes ^ 6;
                        break;
                case 0x67:      /* address-size override */
                        if (mode == X86EMUL_MODE_PROT64)
-                               ad_bytes ^= 12; /* switch between 4/8 bytes */
+                               /* switch between 4/8 bytes */
+                               c->ad_bytes = def_ad_bytes ^ 12;
                        else
-                               ad_bytes ^= 6;  /* switch between 2/4 bytes */
+                               /* switch between 2/4 bytes */
+                               c->ad_bytes = def_ad_bytes ^ 6;
                        break;
                case 0x2e:      /* CS override */
-                       override_base = &ctxt->cs_base;
+                       c->override_base = &ctxt->cs_base;
                        break;
                case 0x3e:      /* DS override */
-                       override_base = &ctxt->ds_base;
+                       c->override_base = &ctxt->ds_base;
                        break;
                case 0x26:      /* ES override */
-                       override_base = &ctxt->es_base;
+                       c->override_base = &ctxt->es_base;
                        break;
                case 0x64:      /* FS override */
-                       override_base = &ctxt->fs_base;
+                       c->override_base = &ctxt->fs_base;
                        break;
                case 0x65:      /* GS override */
-                       override_base = &ctxt->gs_base;
+                       c->override_base = &ctxt->gs_base;
                        break;
                case 0x36:      /* SS override */
-                       override_base = &ctxt->ss_base;
+                       c->override_base = &ctxt->ss_base;
                        break;
+               case 0x40 ... 0x4f: /* REX */
+                       if (mode != X86EMUL_MODE_PROT64)
+                               goto done_prefixes;
+                       c->rex_prefix = c->b;
+                       continue;
                case 0xf0:      /* LOCK */
-                       lock_prefix = 1;
-                       break;
-               case 0xf3:      /* REP/REPE/REPZ */
-                       rep_prefix = 1;
+                       c->lock_prefix = 1;
                        break;
                case 0xf2:      /* REPNE/REPNZ */
+                       c->rep_prefix = REPNE_PREFIX;
+                       break;
+               case 0xf3:      /* REP/REPE/REPZ */
+                       c->rep_prefix = REPE_PREFIX;
                        break;
                default:
                        goto done_prefixes;
                }
+
+               /* Any legacy prefix after a REX prefix nullifies its effect. */
+
+               c->rex_prefix = 0;
        }
 
 done_prefixes:
 
        /* REX prefix. */
-       if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
-               rex_prefix = b;
-               if (b & 8)
-                       op_bytes = 8;   /* REX.W */
-               modrm_reg = (b & 4) << 1;       /* REX.R */
-               index_reg = (b & 2) << 2; /* REX.X */
-               modrm_rm = base_reg = (b & 1) << 3; /* REG.B */
-               b = insn_fetch(u8, 1, _eip);
-       }
+       if (c->rex_prefix)
+               if (c->rex_prefix & 8)
+                       c->op_bytes = 8;        /* REX.W */
 
        /* Opcode byte(s). */
-       d = opcode_table[b];
-       if (d == 0) {
+       c->d = opcode_table[c->b];
+       if (c->d == 0) {
                /* Two-byte opcode? */
-               if (b == 0x0f) {
-                       twobyte = 1;
-                       b = insn_fetch(u8, 1, _eip);
-                       d = twobyte_table[b];
+               if (c->b == 0x0f) {
+                       c->twobyte = 1;
+                       c->b = insn_fetch(u8, 1, c->eip);
+                       c->d = twobyte_table[c->b];
                }
 
                /* Unrecognised? */
-               if (d == 0)
-                       goto cannot_emulate;
+               if (c->d == 0) {
+                       DPRINTF("Cannot emulate %02x\n", c->b);
+                       return -1;
+               }
        }
 
        /* ModRM and SIB bytes. */
-       if (d & ModRM) {
-               modrm = insn_fetch(u8, 1, _eip);
-               modrm_mod |= (modrm & 0xc0) >> 6;
-               modrm_reg |= (modrm & 0x38) >> 3;
-               modrm_rm |= (modrm & 0x07);
-               modrm_ea = 0;
-               use_modrm_ea = 1;
-
-               if (modrm_mod == 3) {
-                       modrm_val = *(unsigned long *)
-                               decode_register(modrm_rm, _regs, d & ByteOp);
-                       goto modrm_done;
-               }
+       if (c->d & ModRM)
+               rc = decode_modrm(ctxt, ops);
+       else if (c->d & MemAbs)
+               rc = decode_abs(ctxt, ops);
+       if (rc)
+               goto done;
 
-               if (ad_bytes == 2) {
-                       unsigned bx = _regs[VCPU_REGS_RBX];
-                       unsigned bp = _regs[VCPU_REGS_RBP];
-                       unsigned si = _regs[VCPU_REGS_RSI];
-                       unsigned di = _regs[VCPU_REGS_RDI];
-
-                       /* 16-bit ModR/M decode. */
-                       switch (modrm_mod) {
-                       case 0:
-                               if (modrm_rm == 6)
-                                       modrm_ea += insn_fetch(u16, 2, _eip);
-                               break;
-                       case 1:
-                               modrm_ea += insn_fetch(s8, 1, _eip);
-                               break;
-                       case 2:
-                               modrm_ea += insn_fetch(u16, 2, _eip);
-                               break;
-                       }
-                       switch (modrm_rm) {
-                       case 0:
-                               modrm_ea += bx + si;
-                               break;
-                       case 1:
-                               modrm_ea += bx + di;
-                               break;
-                       case 2:
-                               modrm_ea += bp + si;
-                               break;
-                       case 3:
-                               modrm_ea += bp + di;
-                               break;
-                       case 4:
-                               modrm_ea += si;
-                               break;
-                       case 5:
-                               modrm_ea += di;
-                               break;
-                       case 6:
-                               if (modrm_mod != 0)
-                                       modrm_ea += bp;
-                               break;
-                       case 7:
-                               modrm_ea += bx;
-                               break;
-                       }
-                       if (modrm_rm == 2 || modrm_rm == 3 ||
-                           (modrm_rm == 6 && modrm_mod != 0))
-                               if (!override_base)
-                                       override_base = &ctxt->ss_base;
-                       modrm_ea = (u16)modrm_ea;
-               } else {
-                       /* 32/64-bit ModR/M decode. */
-                       switch (modrm_rm) {
-                       case 4:
-                       case 12:
-                               sib = insn_fetch(u8, 1, _eip);
-                               index_reg |= (sib >> 3) & 7;
-                               base_reg |= sib & 7;
-                               scale = sib >> 6;
-
-                               switch (base_reg) {
-                               case 5:
-                                       if (modrm_mod != 0)
-                                               modrm_ea += _regs[base_reg];
-                                       else
-                                               modrm_ea += insn_fetch(s32, 4, _eip);
-                                       break;
-                               default:
-                                       modrm_ea += _regs[base_reg];
-                               }
-                               switch (index_reg) {
-                               case 4:
-                                       break;
-                               default:
-                                       modrm_ea += _regs[index_reg] << scale;
-
-                               }
-                               break;
-                       case 5:
-                               if (modrm_mod != 0)
-                                       modrm_ea += _regs[modrm_rm];
-                               else if (mode == X86EMUL_MODE_PROT64)
-                                       rip_relative = 1;
-                               break;
-                       default:
-                               modrm_ea += _regs[modrm_rm];
-                               break;
-                       }
-                       switch (modrm_mod) {
-                       case 0:
-                               if (modrm_rm == 5)
-                                       modrm_ea += insn_fetch(s32, 4, _eip);
-                               break;
-                       case 1:
-                               modrm_ea += insn_fetch(s8, 1, _eip);
-                               break;
-                       case 2:
-                               modrm_ea += insn_fetch(s32, 4, _eip);
-                               break;
-                       }
-               }
-               if (!override_base)
-                       override_base = &ctxt->ds_base;
-               if (mode == X86EMUL_MODE_PROT64 &&
-                   override_base != &ctxt->fs_base &&
-                   override_base != &ctxt->gs_base)
-                       override_base = NULL;
-
-               if (override_base)
-                       modrm_ea += *override_base;
-
-               if (rip_relative) {
-                       modrm_ea += _eip;
-                       switch (d & SrcMask) {
-                       case SrcImmByte:
-                               modrm_ea += 1;
-                               break;
-                       case SrcImm:
-                               if (d & ByteOp)
-                                       modrm_ea += 1;
-                               else
-                                       if (op_bytes == 8)
-                                               modrm_ea += 4;
-                                       else
-                                               modrm_ea += op_bytes;
-                       }
-               }
-               if (ad_bytes != 8)
-                       modrm_ea = (u32)modrm_ea;
-               cr2 = modrm_ea;
-       modrm_done:
-               ;
-       }
+       if (!c->override_base)
+               c->override_base = &ctxt->ds_base;
+       if (mode == X86EMUL_MODE_PROT64 &&
+           c->override_base != &ctxt->fs_base &&
+           c->override_base != &ctxt->gs_base)
+               c->override_base = NULL;
+
+       if (c->override_base)
+               c->modrm_ea += *c->override_base;
 
+       if (c->ad_bytes != 8)
+               c->modrm_ea = (u32)c->modrm_ea;
        /*
         * Decode and fetch the source operand: register, memory
         * or immediate.
         */
-       switch (d & SrcMask) {
+       switch (c->d & SrcMask) {
        case SrcNone:
                break;
        case SrcReg:
-               src.type = OP_REG;
-               if (d & ByteOp) {
-                       src.ptr = decode_register(modrm_reg, _regs,
-                                                 (rex_prefix == 0));
-                       src.val = src.orig_val = *(u8 *) src.ptr;
-                       src.bytes = 1;
-               } else {
-                       src.ptr = decode_register(modrm_reg, _regs, 0);
-                       switch ((src.bytes = op_bytes)) {
-                       case 2:
-                               src.val = src.orig_val = *(u16 *) src.ptr;
-                               break;
-                       case 4:
-                               src.val = src.orig_val = *(u32 *) src.ptr;
-                               break;
-                       case 8:
-                               src.val = src.orig_val = *(u64 *) src.ptr;
-                               break;
-                       }
-               }
+               decode_register_operand(&c->src, c, 0);
                break;
        case SrcMem16:
-               src.bytes = 2;
+               c->src.bytes = 2;
                goto srcmem_common;
        case SrcMem32:
-               src.bytes = 4;
+               c->src.bytes = 4;
                goto srcmem_common;
        case SrcMem:
-               src.bytes = (d & ByteOp) ? 1 : op_bytes;
+               c->src.bytes = (c->d & ByteOp) ? 1 :
+                                                          c->op_bytes;
                /* Don't fetch the address for invlpg: it could be unmapped. */
-               if (twobyte && b == 0x01 && modrm_reg == 7)
+               if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
                        break;
-             srcmem_common:
-               src.type = OP_MEM;
-               src.ptr = (unsigned long *)cr2;
-               if ((rc = ops->read_emulated((unsigned long)src.ptr,
-                                            &src.val, src.bytes, ctxt->vcpu)) != 0)
-                       goto done;
-               src.orig_val = src.val;
+       srcmem_common:
+               /*
+                * For instructions with a ModR/M byte, switch to register
+                * access if Mod = 3.
+                */
+               if ((c->d & ModRM) && c->modrm_mod == 3) {
+                       c->src.type = OP_REG;
+                       break;
+               }
+               c->src.type = OP_MEM;
                break;
        case SrcImm:
-               src.type = OP_IMM;
-               src.ptr = (unsigned long *)_eip;
-               src.bytes = (d & ByteOp) ? 1 : op_bytes;
-               if (src.bytes == 8)
-                       src.bytes = 4;
+               c->src.type = OP_IMM;
+               c->src.ptr = (unsigned long *)c->eip;
+               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               if (c->src.bytes == 8)
+                       c->src.bytes = 4;
                /* NB. Immediates are sign-extended as necessary. */
-               switch (src.bytes) {
+               switch (c->src.bytes) {
                case 1:
-                       src.val = insn_fetch(s8, 1, _eip);
+                       c->src.val = insn_fetch(s8, 1, c->eip);
                        break;
                case 2:
-                       src.val = insn_fetch(s16, 2, _eip);
+                       c->src.val = insn_fetch(s16, 2, c->eip);
                        break;
                case 4:
-                       src.val = insn_fetch(s32, 4, _eip);
+                       c->src.val = insn_fetch(s32, 4, c->eip);
                        break;
                }
                break;
        case SrcImmByte:
-               src.type = OP_IMM;
-               src.ptr = (unsigned long *)_eip;
-               src.bytes = 1;
-               src.val = insn_fetch(s8, 1, _eip);
+               c->src.type = OP_IMM;
+               c->src.ptr = (unsigned long *)c->eip;
+               c->src.bytes = 1;
+               c->src.val = insn_fetch(s8, 1, c->eip);
                break;
        }
 
        /* Decode and fetch the destination operand: register or memory. */
-       switch (d & DstMask) {
+       switch (c->d & DstMask) {
        case ImplicitOps:
                /* Special instructions do their own operand decoding. */
-               goto special_insn;
+               return 0;
        case DstReg:
-               dst.type = OP_REG;
-               if ((d & ByteOp)
-                   && !(twobyte && (b == 0xb6 || b == 0xb7))) {
-                       dst.ptr = decode_register(modrm_reg, _regs,
-                                                 (rex_prefix == 0));
-                       dst.val = *(u8 *) dst.ptr;
-                       dst.bytes = 1;
-               } else {
-                       dst.ptr = decode_register(modrm_reg, _regs, 0);
-                       switch ((dst.bytes = op_bytes)) {
-                       case 2:
-                               dst.val = *(u16 *)dst.ptr;
-                               break;
-                       case 4:
-                               dst.val = *(u32 *)dst.ptr;
-                               break;
-                       case 8:
-                               dst.val = *(u64 *)dst.ptr;
-                               break;
-                       }
-               }
+               decode_register_operand(&c->dst, c,
+                        c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
                break;
        case DstMem:
-               dst.type = OP_MEM;
-               dst.ptr = (unsigned long *)cr2;
-               dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-               if (d & BitOp) {
-                       unsigned long mask = ~(dst.bytes * 8 - 1);
+               if ((c->d & ModRM) && c->modrm_mod == 3) {
+                       c->dst.type = OP_REG;
+                       break;
+               }
+               c->dst.type = OP_MEM;
+               break;
+       }
+
+done:
+       return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+{      /*
+        * Describe a stack push of c->src.val in c->dst: an op_bytes-wide
+        * memory operand whose address is computed from ss_base and RSP
+        * after RSP has been adjusted by -op_bytes.  This function does
+        * not write to guest memory itself; it only updates c->dst and
+        * c->regs[VCPU_REGS_RSP].
+        * NOTE(review): the store presumably happens later, when the
+        * OP_MEM destination is written back - confirm against the caller.
+        */
+       struct decode_cache *c = &ctxt->decode;
+
+       c->dst.type  = OP_MEM;
+       c->dst.bytes = c->op_bytes;
+       c->dst.val = c->src.val;        /* the value to be pushed */
+       register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
+       c->dst.ptr = (void *) register_address(ctxt->ss_base,
+                                              c->regs[VCPU_REGS_RSP]);
+}
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+                               struct x86_emulate_ops *ops)
+{      /*
+        * Pop helper: read c->dst.bytes bytes from the stack address
+        * register_address(ss_base, RSP) into c->dst.val, then advance
+        * RSP by the same number of bytes.
+        *
+        * Returns 0 on success.  If ops->read_std() fails, its non-zero
+        * code is returned and RSP is left unchanged.  The popped value
+        * is only placed in c->dst.val; it is not stored to the
+        * destination operand here.
+        */
+       struct decode_cache *c = &ctxt->decode;
+       int rc;
+
+       /* 64-bit mode: POP always pops a 64-bit operand. */
 
-                       dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
+       if (ctxt->mode == X86EMUL_MODE_PROT64)
+               c->dst.bytes = 8;
+
+       rc = ops->read_std(register_address(ctxt->ss_base,
+                                           c->regs[VCPU_REGS_RSP]),
+                          &c->dst.val, c->dst.bytes, ctxt->vcpu);
+       if (rc != 0)
+               return rc;
+
+       register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
+
+       return 0;
+}
+
+static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+{      /*
+        * Rotate/shift dispatcher: c->modrm_reg selects the operation,
+        * which is applied through emulate_2op_SrcB() to c->src, c->dst
+        * and ctxt->eflags.
+        *
+        * Values 0-7 each have a case (4 and 6 both map to "sal").  There
+        * is no default, so for any other value the switch does nothing
+        * and c->dst / ctxt->eflags are left as they were.
+        */
+       struct decode_cache *c = &ctxt->decode;
+       switch (c->modrm_reg) {
+       case 0: /* rol */
+               emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+               break;
+       case 1: /* ror */
+               emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+               break;
+       case 2: /* rcl */
+               emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+               break;
+       case 3: /* rcr */
+               emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+               break;
+       case 4: /* sal/shl */
+       case 6: /* sal/shl */
+               emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+               break;
+       case 5: /* shr */
+               emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+               break;
+       case 7: /* sar */
+               emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+               break;
+       }
+}
+
+static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops)
+{      /*
+        * Grp3 dispatcher, selected by c->modrm_reg:
+        *   0, 1  test - fetches its own immediate source operand
+        *   2     not  - bitwise complement of c->dst.val
+        *   3     neg  - via emulate_1op()
+        * Any other value logs through DPRINTF and yields
+        * X86EMUL_UNHANDLEABLE.  Returns 0 when the operation was carried
+        * out.
+        * NOTE(review): "ops" is not referenced directly in this body; it
+        * is presumably consumed by the insn_fetch() macro - confirm.
+        */
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+
+       switch (c->modrm_reg) {
+       case 0 ... 1:   /* test */
+               /*
+                * Special case in Grp3: test has an immediate
+                * source operand.
+                */
+               c->src.type = OP_IMM;
+               c->src.ptr = (unsigned long *)c->eip;
+               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               if (c->src.bytes == 8)  /* 8-byte operands fetch a 4-byte */
+                       c->src.bytes = 4;       /* immediate (as s32) */
+               switch (c->src.bytes) {
+               case 1:
+                       c->src.val = insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->src.val = insn_fetch(s16, 2, c->eip);
+                       break;
+               case 4:
+                       c->src.val = insn_fetch(s32, 4, c->eip);
+                       break;
                }
-               if (!(d & Mov) && /* optimisation - avoid slow emulated read */
-                   ((rc = ops->read_emulated((unsigned long)dst.ptr,
-                                             &dst.val, dst.bytes, ctxt->vcpu)) != 0))
-                       goto done;
+               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+               break;
+       case 2: /* not */
+               c->dst.val = ~c->dst.val;
+               break;
+       case 3: /* neg */
+               emulate_1op("neg", c->dst, ctxt->eflags);
+               break;
+       default:
+               DPRINTF("Cannot emulate %02x\n", c->b);
+               rc = X86EMUL_UNHANDLEABLE;
                break;
        }
-       dst.orig_val = dst.val;
+done:  /* NOTE(review): no goto in this body targets this label; it is
+        * presumably the exit used by insn_fetch() on failure - confirm */
+       return rc;
+}
 
-       if (twobyte)
+static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops)
+{      /*
+        * Handler for the Grp4/Grp5 opcodes (per the function name),
+        * selected by c->modrm_reg:
+        *   0  inc      - via emulate_1op()
+        *   1  dec      - via emulate_1op()
+        *   4  jmp abs  - only when the opcode byte c->b is 0xff
+        *   6  push     - writes c->dst.val to the stack directly
+        * Everything else logs through DPRINTF and returns
+        * X86EMUL_UNHANDLEABLE.  Returns 0 on success, or the non-zero
+        * code of a failing ops->read_std() / ops->write_emulated() call.
+        */
+       struct decode_cache *c = &ctxt->decode;
+       int rc;
+
+       switch (c->modrm_reg) {
+       case 0: /* inc */
+               emulate_1op("inc", c->dst, ctxt->eflags);
+               break;
+       case 1: /* dec */
+               emulate_1op("dec", c->dst, ctxt->eflags);
+               break;
+       case 4: /* jmp abs */
+               if (c->b == 0xff)
+                       c->eip = c->dst.val;    /* operand is the new eip */
+               else {
+                       DPRINTF("Cannot emulate %02x\n", c->b);
+                       return X86EMUL_UNHANDLEABLE;
+               }
+               break;
+       case 6: /* push */
+
+               /* 64-bit mode: PUSH always pushes a 64-bit operand. */
+
+               if (ctxt->mode == X86EMUL_MODE_PROT64) {
+                       c->dst.bytes = 8;
+                       rc = ops->read_std((unsigned long)c->dst.ptr,
+                                          &c->dst.val, 8, ctxt->vcpu);
+                       if (rc != 0)    /* 8-byte re-read of operand failed */
+                               return rc;
+               }
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          -c->dst.bytes);
+               rc = ops->write_emulated(register_address(ctxt->ss_base,
+                                   c->regs[VCPU_REGS_RSP]), &c->dst.val,
+                                   c->dst.bytes, ctxt->vcpu);
+               if (rc != 0)
+                       return rc;
+               c->dst.type = OP_NONE;  /* already stored above, so the
+                                        * generic writeback must not run */
+               break;
+       default:
+               DPRINTF("Cannot emulate %02x\n", c->b);
+               return X86EMUL_UNHANDLEABLE;
+       }
+       return 0;
+}
+
+static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops,
+                              unsigned long cr2)
+{      /*
+        * 64-bit compare-and-exchange on the memory operand at cr2 (the
+        * CMPXCHG8B pattern).  The 8 bytes at cr2 are read and compared
+        * with the low 32 bits of RDX (high half) and RAX (low half):
+        *   - mismatch: the memory value is loaded into RAX/RDX (each
+        *     truncated to 32 bits) and EFLG_ZF is cleared;
+        *   - match: the low 32 bits of RCX (high half) and RBX (low
+        *     half) are stored through ops->cmpxchg_emulated() and
+        *     EFLG_ZF is set.
+        * Returns 0 on success, or the non-zero code of a failing
+        * ops->read_emulated() / ops->cmpxchg_emulated() call.
+        */
+       struct decode_cache *c = &ctxt->decode;
+       u64 old, new;
+       int rc;
+
+       rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu);
+       if (rc != 0)
+               return rc;
+
+       if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
+           ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
+
+               c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+               c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+               ctxt->eflags &= ~EFLG_ZF;
+
+       } else {
+               new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+                      (u32) c->regs[VCPU_REGS_RBX];
+
+               rc = ops->cmpxchg_emulated(cr2, &old, &new, 8, ctxt->vcpu);
+               if (rc != 0)    /* ZF is left untouched on this failure */
+                       return rc;
+               ctxt->eflags |= EFLG_ZF;
+       }
+       return 0;
+}
+
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+                           struct x86_emulate_ops *ops)
+{      /*
+        * Commit the destination operand c->dst according to its type:
+        *   OP_REG  - store c->dst.val through c->dst.ptr, sized by
+        *             c->dst.bytes (1, 2, 4 or 8; other sizes store
+        *             nothing);
+        *   OP_MEM  - write c->dst.bytes bytes to the address in
+        *             c->dst.ptr, using ops->cmpxchg_emulated() against
+        *             c->dst.orig_val when c->lock_prefix is set and
+        *             ops->write_emulated() otherwise;
+        *   OP_NONE and any other type - nothing is written.
+        * Returns 0 on success, or the non-zero code of the failing
+        * memory callback.
+        */
+       int rc;
+       struct decode_cache *c = &ctxt->decode;
+
+       switch (c->dst.type) {
+       case OP_REG:
+               /* The 4-byte case *is* correct:
+                * in 64-bit mode we zero-extend.
+                */
+               switch (c->dst.bytes) {
+               case 1:
+                       *(u8 *)c->dst.ptr = (u8)c->dst.val;
+                       break;
+               case 2:
+                       *(u16 *)c->dst.ptr = (u16)c->dst.val;
+                       break;
+               case 4:
+                       *c->dst.ptr = (u32)c->dst.val;
+                       break;  /* 64b: zero-ext */
+               case 8:
+                       *c->dst.ptr = c->dst.val;
+                       break;
+               }
+               break;
+       case OP_MEM:
+               if (c->lock_prefix)
+                       rc = ops->cmpxchg_emulated(
+                                       (unsigned long)c->dst.ptr,
+                                       &c->dst.orig_val,
+                                       &c->dst.val,
+                                       c->dst.bytes,
+                                       ctxt->vcpu);
+               else
+                       rc = ops->write_emulated(
+                                       (unsigned long)c->dst.ptr,
+                                       &c->dst.val,
+                                       c->dst.bytes,
+                                       ctxt->vcpu);
+               if (rc != 0)
+                       return rc;
+               break;
+       case OP_NONE:
+               /* no writeback */
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
+int
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+       unsigned long cr2 = ctxt->cr2;
+       u64 msr_data;
+       unsigned long saved_eip = 0;
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+
+       /* Shadow copy of register state. Committed on successful emulation.
+        * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
+        * modify them.
+        */
+
+       memcpy(c->regs, ctxt->vcpu->regs, sizeof c->regs);
+       saved_eip = c->eip;
+
+       if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
+               cr2 = c->modrm_ea;
+
+       if (c->src.type == OP_MEM) {
+               c->src.ptr = (unsigned long *)cr2;
+               c->src.val = 0;
+               rc = ops->read_emulated((unsigned long)c->src.ptr,
+                                       &c->src.val,
+                                       c->src.bytes,
+                                       ctxt->vcpu);
+               if (rc != 0)
+                       goto done;
+               c->src.orig_val = c->src.val;
+       }
+
+       if ((c->d & DstMask) == ImplicitOps)
+               goto special_insn;
+
+
+       if (c->dst.type == OP_MEM) {
+               c->dst.ptr = (unsigned long *)cr2;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.val = 0;
+               if (c->d & BitOp) {
+                       unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+                       c->dst.ptr = (void *)c->dst.ptr +
+                                                  (c->src.val & mask) / 8;
+               }
+               if (!(c->d & Mov) &&
+                                  /* optimisation - avoid slow emulated read */
+                   ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
+                                          &c->dst.val,
+                                         c->dst.bytes, ctxt->vcpu)) != 0))
+                       goto done;
+       }
+       c->dst.orig_val = c->dst.val;
+
+       if (c->twobyte)
                goto twobyte_insn;
 
-       switch (b) {
+       switch (c->b) {
        case 0x00 ... 0x05:
              add:              /* add */
-               emulate_2op_SrcV("add", src, dst, _eflags);
+               emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
                break;
        case 0x08 ... 0x0d:
              or:               /* or */
-               emulate_2op_SrcV("or", src, dst, _eflags);
+               emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
                break;
        case 0x10 ... 0x15:
              adc:              /* adc */
-               emulate_2op_SrcV("adc", src, dst, _eflags);
+               emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
                break;
        case 0x18 ... 0x1d:
              sbb:              /* sbb */
-               emulate_2op_SrcV("sbb", src, dst, _eflags);
+               emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
                break;
        case 0x20 ... 0x23:
              and:              /* and */
-               emulate_2op_SrcV("and", src, dst, _eflags);
+               emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
                break;
        case 0x24:              /* and al imm8 */
-               dst.type = OP_REG;
-               dst.ptr = &_regs[VCPU_REGS_RAX];
-               dst.val = *(u8 *)dst.ptr;
-               dst.bytes = 1;
-               dst.orig_val = dst.val;
+               c->dst.type = OP_REG;
+               c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+               c->dst.val = *(u8 *)c->dst.ptr;
+               c->dst.bytes = 1;
+               c->dst.orig_val = c->dst.val;
                goto and;
        case 0x25:              /* and ax imm16, or eax imm32 */
-               dst.type = OP_REG;
-               dst.bytes = op_bytes;
-               dst.ptr = &_regs[VCPU_REGS_RAX];
-               if (op_bytes == 2)
-                       dst.val = *(u16 *)dst.ptr;
+               c->dst.type = OP_REG;
+               c->dst.bytes = c->op_bytes;
+               c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+               if (c->op_bytes == 2)
+                       c->dst.val = *(u16 *)c->dst.ptr;
                else
-                       dst.val = *(u32 *)dst.ptr;
-               dst.orig_val = dst.val;
+                       c->dst.val = *(u32 *)c->dst.ptr;
+               c->dst.orig_val = c->dst.val;
                goto and;
        case 0x28 ... 0x2d:
              sub:              /* sub */
-               emulate_2op_SrcV("sub", src, dst, _eflags);
+               emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
                break;
        case 0x30 ... 0x35:
              xor:              /* xor */
-               emulate_2op_SrcV("xor", src, dst, _eflags);
+               emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
                break;
        case 0x38 ... 0x3d:
              cmp:              /* cmp */
-               emulate_2op_SrcV("cmp", src, dst, _eflags);
+               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x40 ... 0x47: /* inc r16/r32 */
+               emulate_1op("inc", c->dst, ctxt->eflags);
+               break;
+       case 0x48 ... 0x4f: /* dec r16/r32 */
+               emulate_1op("dec", c->dst, ctxt->eflags);
+               break;
+       case 0x50 ... 0x57:  /* push reg */
+               c->dst.type  = OP_MEM;
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = c->src.val;
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          -c->op_bytes);
+               c->dst.ptr = (void *) register_address(
+                       ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
+               break;
+       case 0x58 ... 0x5f: /* pop reg */
+       pop_instruction:
+               if ((rc = ops->read_std(register_address(ctxt->ss_base,
+                       c->regs[VCPU_REGS_RSP]), c->dst.ptr,
+                       c->op_bytes, ctxt->vcpu)) != 0)
+                       goto done;
+
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          c->op_bytes);
+               c->dst.type = OP_NONE;  /* Disable writeback. */
                break;
        case 0x63:              /* movsxd */
-               if (mode != X86EMUL_MODE_PROT64)
+               if (ctxt->mode != X86EMUL_MODE_PROT64)
                        goto cannot_emulate;
-               dst.val = (s32) src.val;
-               break;
-       case 0x6a: /* push imm8 */
-               src.val = 0L;
-               src.val = insn_fetch(s8, 1, _eip);
-push:
-               dst.type  = OP_MEM;
-               dst.bytes = op_bytes;
-               dst.val = src.val;
-               register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
-               dst.ptr = (void *) register_address(ctxt->ss_base,
-                                                       _regs[VCPU_REGS_RSP]);
+               c->dst.val = (s32) c->src.val;
                break;
        case 0x80 ... 0x83:     /* Grp1 */
-               switch (modrm_reg) {
+               switch (c->modrm_reg) {
                case 0:
                        goto add;
                case 1:
@@ -994,388 +1368,313 @@ push:
                }
                break;
        case 0x84 ... 0x85:
-             test:             /* test */
-               emulate_2op_SrcV("test", src, dst, _eflags);
+               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
                break;
        case 0x86 ... 0x87:     /* xchg */
                /* Write back the register source. */
-               switch (dst.bytes) {
+               switch (c->dst.bytes) {
                case 1:
-                       *(u8 *) src.ptr = (u8) dst.val;
+                       *(u8 *) c->src.ptr = (u8) c->dst.val;
                        break;
                case 2:
-                       *(u16 *) src.ptr = (u16) dst.val;
+                       *(u16 *) c->src.ptr = (u16) c->dst.val;
                        break;
                case 4:
-                       *src.ptr = (u32) dst.val;
+                       *c->src.ptr = (u32) c->dst.val;
                        break;  /* 64b reg: zero-extend */
                case 8:
-                       *src.ptr = dst.val;
+                       *c->src.ptr = c->dst.val;
                        break;
                }
                /*
                 * Write back the memory destination with implicit LOCK
                 * prefix.
                 */
-               dst.val = src.val;
-               lock_prefix = 1;
+               c->dst.val = c->src.val;
+               c->lock_prefix = 1;
                break;
        case 0x88 ... 0x8b:     /* mov */
                goto mov;
        case 0x8d: /* lea r16/r32, m */
-               dst.val = modrm_val;
+               c->dst.val = c->modrm_val;
                break;
        case 0x8f:              /* pop (sole member of Grp1a) */
-               /* 64-bit mode: POP always pops a 64-bit operand. */
-               if (mode == X86EMUL_MODE_PROT64)
-                       dst.bytes = 8;
-               if ((rc = ops->read_std(register_address(ctxt->ss_base,
-                                                        _regs[VCPU_REGS_RSP]),
-                                       &dst.val, dst.bytes, ctxt->vcpu)) != 0)
+               rc = emulate_grp1a(ctxt, ops);
+               if (rc != 0)
                        goto done;
-               register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
                break;
        case 0xa0 ... 0xa1:     /* mov */
-               dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
-               dst.val = src.val;
-               _eip += ad_bytes;       /* skip src displacement */
+               c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+               c->dst.val = c->src.val;
                break;
        case 0xa2 ... 0xa3:     /* mov */
-               dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
-               _eip += ad_bytes;       /* skip dst displacement */
+               c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
                break;
        case 0xc0 ... 0xc1:
-             grp2:             /* Grp2 */
-               switch (modrm_reg) {
-               case 0: /* rol */
-                       emulate_2op_SrcB("rol", src, dst, _eflags);
-                       break;
-               case 1: /* ror */
-                       emulate_2op_SrcB("ror", src, dst, _eflags);
-                       break;
-               case 2: /* rcl */
-                       emulate_2op_SrcB("rcl", src, dst, _eflags);
-                       break;
-               case 3: /* rcr */
-                       emulate_2op_SrcB("rcr", src, dst, _eflags);
-                       break;
-               case 4: /* sal/shl */
-               case 6: /* sal/shl */
-                       emulate_2op_SrcB("sal", src, dst, _eflags);
-                       break;
-               case 5: /* shr */
-                       emulate_2op_SrcB("shr", src, dst, _eflags);
-                       break;
-               case 7: /* sar */
-                       emulate_2op_SrcB("sar", src, dst, _eflags);
-                       break;
-               }
+               emulate_grp2(ctxt);
                break;
        case 0xc6 ... 0xc7:     /* mov (sole member of Grp11) */
        mov:
-               dst.val = src.val;
+               c->dst.val = c->src.val;
                break;
        case 0xd0 ... 0xd1:     /* Grp2 */
-               src.val = 1;
-               goto grp2;
+               c->src.val = 1;
+               emulate_grp2(ctxt);
+               break;
        case 0xd2 ... 0xd3:     /* Grp2 */
-               src.val = _regs[VCPU_REGS_RCX];
-               goto grp2;
-       case 0xe8: /* call (near) */ {
-               long int rel;
-               switch (op_bytes) {
-               case 2:
-                       rel = insn_fetch(s16, 2, _eip);
-                       break;
-               case 4:
-                       rel = insn_fetch(s32, 4, _eip);
-                       break;
-               case 8:
-                       rel = insn_fetch(s64, 8, _eip);
-                       break;
-               default:
-                       DPRINTF("Call: Invalid op_bytes\n");
-                       goto cannot_emulate;
-               }
-               src.val = (unsigned long) _eip;
-               JMP_REL(rel);
-               goto push;
-       }
-       case 0xe9: /* jmp rel */
-       case 0xeb: /* jmp rel short */
-               JMP_REL(src.val);
-               no_wb = 1; /* Disable writeback. */
+               c->src.val = c->regs[VCPU_REGS_RCX];
+               emulate_grp2(ctxt);
                break;
        case 0xf6 ... 0xf7:     /* Grp3 */
-               switch (modrm_reg) {
-               case 0 ... 1:   /* test */
-                       /*
-                        * Special case in Grp3: test has an immediate
-                        * source operand.
-                        */
-                       src.type = OP_IMM;
-                       src.ptr = (unsigned long *)_eip;
-                       src.bytes = (d & ByteOp) ? 1 : op_bytes;
-                       if (src.bytes == 8)
-                               src.bytes = 4;
-                       switch (src.bytes) {
-                       case 1:
-                               src.val = insn_fetch(s8, 1, _eip);
-                               break;
-                       case 2:
-                               src.val = insn_fetch(s16, 2, _eip);
-                               break;
-                       case 4:
-                               src.val = insn_fetch(s32, 4, _eip);
-                               break;
-                       }
-                       goto test;
-               case 2: /* not */
-                       dst.val = ~dst.val;
-                       break;
-               case 3: /* neg */
-                       emulate_1op("neg", dst, _eflags);
-                       break;
-               default:
-                       goto cannot_emulate;
-               }
+               rc = emulate_grp3(ctxt, ops);
+               if (rc != 0)
+                       goto done;
                break;
        case 0xfe ... 0xff:     /* Grp4/Grp5 */
-               switch (modrm_reg) {
-               case 0: /* inc */
-                       emulate_1op("inc", dst, _eflags);
-                       break;
-               case 1: /* dec */
-                       emulate_1op("dec", dst, _eflags);
-                       break;
-               case 6: /* push */
-                       /* 64-bit mode: PUSH always pushes a 64-bit operand. */
-                       if (mode == X86EMUL_MODE_PROT64) {
-                               dst.bytes = 8;
-                               if ((rc = ops->read_std((unsigned long)dst.ptr,
-                                                       &dst.val, 8,
-                                                       ctxt->vcpu)) != 0)
-                                       goto done;
-                       }
-                       register_address_increment(_regs[VCPU_REGS_RSP],
-                                                  -dst.bytes);
-                       if ((rc = ops->write_std(
-                                    register_address(ctxt->ss_base,
-                                                     _regs[VCPU_REGS_RSP]),
-                                    &dst.val, dst.bytes, ctxt->vcpu)) != 0)
-                               goto done;
-                       no_wb = 1;
-                       break;
-               default:
-                       goto cannot_emulate;
-               }
+               rc = emulate_grp45(ctxt, ops);
+               if (rc != 0)
+                       goto done;
                break;
        }
 
 writeback:
-       if (!no_wb) {
-               switch (dst.type) {
-               case OP_REG:
-                       /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
-                       switch (dst.bytes) {
-                       case 1:
-                               *(u8 *)dst.ptr = (u8)dst.val;
-                               break;
-                       case 2:
-                               *(u16 *)dst.ptr = (u16)dst.val;
-                               break;
-                       case 4:
-                               *dst.ptr = (u32)dst.val;
-                               break;  /* 64b: zero-ext */
-                       case 8:
-                               *dst.ptr = dst.val;
-                               break;
-                       }
-                       break;
-               case OP_MEM:
-                       if (lock_prefix)
-                               rc = ops->cmpxchg_emulated((unsigned long)dst.
-                                                          ptr, &dst.orig_val,
-                                                          &dst.val, dst.bytes,
-                                                          ctxt->vcpu);
-                       else
-                               rc = ops->write_emulated((unsigned long)dst.ptr,
-                                                        &dst.val, dst.bytes,
-                                                        ctxt->vcpu);
-                       if (rc != 0)
-                               goto done;
-               default:
-                       break;
-               }
-       }
+       rc = writeback(ctxt, ops);
+       if (rc != 0)
+               goto done;
 
        /* Commit shadow register state. */
-       memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
-       ctxt->eflags = _eflags;
-       ctxt->vcpu->rip = _eip;
+       memcpy(ctxt->vcpu->regs, c->regs, sizeof c->regs);
+       ctxt->vcpu->rip = c->eip;
 
 done:
-       return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+       if (rc == X86EMUL_UNHANDLEABLE) {
+               c->eip = saved_eip;
+               return -1;
+       }
+       return 0;
 
 special_insn:
-       if (twobyte)
+       if (c->twobyte)
                goto twobyte_special_insn;
-       switch(b) {
-       case 0x50 ... 0x57:  /* push reg */
-               if (op_bytes == 2)
-                       src.val = (u16) _regs[b & 0x7];
-               else
-                       src.val = (u32) _regs[b & 0x7];
-               dst.type  = OP_MEM;
-               dst.bytes = op_bytes;
-               dst.val = src.val;
-               register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
-               dst.ptr = (void *) register_address(
-                       ctxt->ss_base, _regs[VCPU_REGS_RSP]);
-               break;
-       case 0x58 ... 0x5f: /* pop reg */
-               dst.ptr = (unsigned long *)&_regs[b & 0x7];
-       pop_instruction:
-               if ((rc = ops->read_std(register_address(ctxt->ss_base,
-                       _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
-                       != 0)
-                       goto done;
-
-               register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
-               no_wb = 1; /* Disable writeback. */
+       switch (c->b) {
+       case 0x6a: /* push imm8 */
+               c->src.val = 0L;
+               c->src.val = insn_fetch(s8, 1, c->eip);
+               emulate_push(ctxt);
                break;
        case 0x6c:              /* insb */
        case 0x6d:              /* insw/insd */
                 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
-                               1,                                      /* in */
-                               (d & ByteOp) ? 1 : op_bytes,            /* size */
-                               rep_prefix ?
-                               address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
-                               (_eflags & EFLG_DF),                    /* down */
+                               1,
+                               (c->d & ByteOp) ? 1 : c->op_bytes,
+                               c->rep_prefix ?
+                               address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+                               (ctxt->eflags & EFLG_DF),
                                register_address(ctxt->es_base,
-                                                _regs[VCPU_REGS_RDI]), /* address */
-                               rep_prefix,
-                               _regs[VCPU_REGS_RDX]                    /* port */
-                               ) == 0)
+                                                c->regs[VCPU_REGS_RDI]),
+                               c->rep_prefix,
+                               c->regs[VCPU_REGS_RDX]) == 0) {
+                       c->eip = saved_eip;
                        return -1;
+               }
                return 0;
        case 0x6e:              /* outsb */
        case 0x6f:              /* outsw/outsd */
                if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
-                               0,                                      /* in */
-                               (d & ByteOp) ? 1 : op_bytes,            /* size */
-                               rep_prefix ?
-                               address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
-                               (_eflags & EFLG_DF),                    /* down */
-                               register_address(override_base ?
-                                                *override_base : ctxt->ds_base,
-                                                _regs[VCPU_REGS_RSI]), /* address */
-                               rep_prefix,
-                               _regs[VCPU_REGS_RDX]                    /* port */
-                               ) == 0)
+                               0,
+                               (c->d & ByteOp) ? 1 : c->op_bytes,
+                               c->rep_prefix ?
+                               address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+                               (ctxt->eflags & EFLG_DF),
+                               register_address(c->override_base ?
+                                                       *c->override_base :
+                                                       ctxt->ds_base,
+                                                c->regs[VCPU_REGS_RSI]),
+                               c->rep_prefix,
+                               c->regs[VCPU_REGS_RDX]) == 0) {
+                       c->eip = saved_eip;
                        return -1;
+               }
                return 0;
        case 0x70 ... 0x7f: /* jcc (short) */ {
-               int rel = insn_fetch(s8, 1, _eip);
+               int rel = insn_fetch(s8, 1, c->eip);
 
-               if (test_cc(b, _eflags))
+               if (test_cc(c->b, ctxt->eflags))
                JMP_REL(rel);
                break;
        }
        case 0x9c: /* pushf */
-               src.val =  (unsigned long) _eflags;
-               goto push;
+               c->src.val =  (unsigned long) ctxt->eflags;
+               emulate_push(ctxt);
+               break;
+       case 0x9d: /* popf */
+               c->dst.ptr = (unsigned long *) &ctxt->eflags;
+               goto pop_instruction;
        case 0xc3: /* ret */
-               dst.ptr = &_eip;
+               c->dst.ptr = &c->eip;
                goto pop_instruction;
        case 0xf4:              /* hlt */
                ctxt->vcpu->halt_request = 1;
                goto done;
+       case 0xf5:      /* cmc */
+               /* complement carry flag from eflags reg */
+               ctxt->eflags ^= EFLG_CF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xf8: /* clc */
+               ctxt->eflags &= ~EFLG_CF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xfa: /* cli */
+               ctxt->eflags &= ~X86_EFLAGS_IF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xfb: /* sti */
+               ctxt->eflags |= X86_EFLAGS_IF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
        }
-       if (rep_prefix) {
-               if (_regs[VCPU_REGS_RCX] == 0) {
-                       ctxt->vcpu->rip = _eip;
+       if (c->rep_prefix) {
+               if (c->regs[VCPU_REGS_RCX] == 0) {
+                       ctxt->vcpu->rip = c->eip;
                        goto done;
                }
-               _regs[VCPU_REGS_RCX]--;
-               _eip = ctxt->vcpu->rip;
+               c->regs[VCPU_REGS_RCX]--;
+               c->eip = ctxt->vcpu->rip;
        }
-       switch (b) {
+       switch (c->b) {
        case 0xa4 ... 0xa5:     /* movs */
-               dst.type = OP_MEM;
-               dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-               dst.ptr = (unsigned long *)register_address(ctxt->es_base,
-                                                       _regs[VCPU_REGS_RDI]);
+               c->dst.type = OP_MEM;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.ptr = (unsigned long *)register_address(
+                                                  ctxt->es_base,
+                                                  c->regs[VCPU_REGS_RDI]);
                if ((rc = ops->read_emulated(register_address(
-                     override_base ? *override_base : ctxt->ds_base,
-                     _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
+                     c->override_base ? *c->override_base :
+                                       ctxt->ds_base,
+                                       c->regs[VCPU_REGS_RSI]),
+                                       &c->dst.val,
+                                       c->dst.bytes, ctxt->vcpu)) != 0)
                        goto done;
-               register_address_increment(_regs[VCPU_REGS_RSI],
-                            (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
-               register_address_increment(_regs[VCPU_REGS_RDI],
-                            (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+               register_address_increment(c->regs[VCPU_REGS_RSI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
+               register_address_increment(c->regs[VCPU_REGS_RDI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
                break;
        case 0xa6 ... 0xa7:     /* cmps */
                DPRINTF("Urk! I don't handle CMPS.\n");
                goto cannot_emulate;
        case 0xaa ... 0xab:     /* stos */
-               dst.type = OP_MEM;
-               dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-               dst.ptr = (unsigned long *)cr2;
-               dst.val = _regs[VCPU_REGS_RAX];
-               register_address_increment(_regs[VCPU_REGS_RDI],
-                            (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+               c->dst.type = OP_MEM;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.ptr = (unsigned long *)register_address(
+                                                  ctxt->es_base,
+                                                  c->regs[VCPU_REGS_RDI]);
+               c->dst.val = c->regs[VCPU_REGS_RAX];
+               register_address_increment(c->regs[VCPU_REGS_RDI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
                break;
        case 0xac ... 0xad:     /* lods */
-               dst.type = OP_REG;
-               dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-               dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
-               if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
-                                            ctxt->vcpu)) != 0)
+               c->dst.type = OP_REG;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+               if ((rc = ops->read_emulated(register_address(
+                               c->override_base ? *c->override_base :
+                                                  ctxt->ds_base,
+                                                c->regs[VCPU_REGS_RSI]),
+                                                &c->dst.val,
+                                                c->dst.bytes,
+                                                ctxt->vcpu)) != 0)
                        goto done;
-               register_address_increment(_regs[VCPU_REGS_RSI],
-                          (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
+               register_address_increment(c->regs[VCPU_REGS_RSI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
                break;
        case 0xae ... 0xaf:     /* scas */
                DPRINTF("Urk! I don't handle SCAS.\n");
                goto cannot_emulate;
+       case 0xe8: /* call (near) */ {
+               long int rel;
+               switch (c->op_bytes) {
+               case 2:
+                       rel = insn_fetch(s16, 2, c->eip);
+                       break;
+               case 4:
+                       rel = insn_fetch(s32, 4, c->eip);
+                       break;
+               default:
+                       DPRINTF("Call: Invalid op_bytes\n");
+                       goto cannot_emulate;
+               }
+               c->src.val = (unsigned long) c->eip;
+               JMP_REL(rel);
+               c->op_bytes = c->ad_bytes;
+               emulate_push(ctxt);
+               break;
+       }
+       case 0xe9: /* jmp rel */
+       case 0xeb: /* jmp rel short */
+               JMP_REL(c->src.val);
+               c->dst.type = OP_NONE; /* Disable writeback. */
+               break;
+
 
        }
        goto writeback;
 
 twobyte_insn:
-       switch (b) {
+       switch (c->b) {
        case 0x01: /* lgdt, lidt, lmsw */
-               /* Disable writeback. */
-               no_wb = 1;
-               switch (modrm_reg) {
+               switch (c->modrm_reg) {
                        u16 size;
                        unsigned long address;
 
-               case 2: /* lgdt */
-                       rc = read_descriptor(ctxt, ops, src.ptr,
-                                            &size, &address, op_bytes);
+               case 0: /* vmcall */
+                       if (c->modrm_mod != 3 || c->modrm_rm != 1)
+                               goto cannot_emulate;
+
+                       rc = kvm_fix_hypercall(ctxt->vcpu);
                        if (rc)
                                goto done;
-                       realmode_lgdt(ctxt->vcpu, size, address);
+
+                       kvm_emulate_hypercall(ctxt->vcpu);
                        break;
-               case 3: /* lidt */
-                       rc = read_descriptor(ctxt, ops, src.ptr,
-                                            &size, &address, op_bytes);
+               case 2: /* lgdt */
+                       rc = read_descriptor(ctxt, ops, c->src.ptr,
+                                            &size, &address, c->op_bytes);
                        if (rc)
                                goto done;
-                       realmode_lidt(ctxt->vcpu, size, address);
+                       realmode_lgdt(ctxt->vcpu, size, address);
+                       break;
+               case 3: /* lidt/vmmcall */
+                       if (c->modrm_mod == 3 && c->modrm_rm == 1) {
+                               rc = kvm_fix_hypercall(ctxt->vcpu);
+                               if (rc)
+                                       goto done;
+                               kvm_emulate_hypercall(ctxt->vcpu);
+                       } else {
+                               rc = read_descriptor(ctxt, ops, c->src.ptr,
+                                                    &size, &address,
+                                                    c->op_bytes);
+                               if (rc)
+                                       goto done;
+                               realmode_lidt(ctxt->vcpu, size, address);
+                       }
                        break;
                case 4: /* smsw */
-                       if (modrm_mod != 3)
+                       if (c->modrm_mod != 3)
                                goto cannot_emulate;
-                       *(u16 *)&_regs[modrm_rm]
+                       *(u16 *)&c->regs[c->modrm_rm]
                                = realmode_get_cr(ctxt->vcpu, 0);
                        break;
                case 6: /* lmsw */
-                       if (modrm_mod != 3)
+                       if (c->modrm_mod != 3)
                                goto cannot_emulate;
-                       realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
+                       realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
+                                                 &ctxt->eflags);
                        break;
                case 7: /* invlpg*/
                        emulate_invlpg(ctxt->vcpu, cr2);
@@ -1383,94 +1682,74 @@ twobyte_insn:
                default:
                        goto cannot_emulate;
                }
+               /* Disable writeback. */
+               c->dst.type = OP_NONE;
                break;
        case 0x21: /* mov from dr to reg */
-               no_wb = 1;
-               if (modrm_mod != 3)
+               if (c->modrm_mod != 3)
                        goto cannot_emulate;
-               rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
+               rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+               if (rc)
+                       goto cannot_emulate;
+               c->dst.type = OP_NONE;  /* no writeback */
                break;
        case 0x23: /* mov from reg to dr */
-               no_wb = 1;
-               if (modrm_mod != 3)
+               if (c->modrm_mod != 3)
+                       goto cannot_emulate;
+               rc = emulator_set_dr(ctxt, c->modrm_reg,
+                                    c->regs[c->modrm_rm]);
+               if (rc)
                        goto cannot_emulate;
-               rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
+               c->dst.type = OP_NONE;  /* no writeback */
                break;
        case 0x40 ... 0x4f:     /* cmov */
-               dst.val = dst.orig_val = src.val;
-               no_wb = 1;
-               /*
-                * First, assume we're decoding an even cmov opcode
-                * (lsb == 0).
-                */
-               switch ((b & 15) >> 1) {
-               case 0: /* cmovo */
-                       no_wb = (_eflags & EFLG_OF) ? 0 : 1;
-                       break;
-               case 1: /* cmovb/cmovc/cmovnae */
-                       no_wb = (_eflags & EFLG_CF) ? 0 : 1;
-                       break;
-               case 2: /* cmovz/cmove */
-                       no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
-                       break;
-               case 3: /* cmovbe/cmovna */
-                       no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
-                       break;
-               case 4: /* cmovs */
-                       no_wb = (_eflags & EFLG_SF) ? 0 : 1;
-                       break;
-               case 5: /* cmovp/cmovpe */
-                       no_wb = (_eflags & EFLG_PF) ? 0 : 1;
-                       break;
-               case 7: /* cmovle/cmovng */
-                       no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
-                       /* fall through */
-               case 6: /* cmovl/cmovnge */
-                       no_wb &= (!(_eflags & EFLG_SF) !=
-                             !(_eflags & EFLG_OF)) ? 0 : 1;
-                       break;
-               }
-               /* Odd cmov opcodes (lsb == 1) have inverted sense. */
-               no_wb ^= b & 1;
+               c->dst.val = c->dst.orig_val = c->src.val;
+               if (!test_cc(c->b, ctxt->eflags))
+                       c->dst.type = OP_NONE; /* no writeback */
                break;
        case 0xa3:
              bt:               /* bt */
-               src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-               emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
+               c->dst.type = OP_NONE;
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
                break;
        case 0xab:
              bts:              /* bts */
-               src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-               emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
                break;
        case 0xb0 ... 0xb1:     /* cmpxchg */
                /*
                 * Save real source value, then compare EAX against
                 * destination.
                 */
-               src.orig_val = src.val;
-               src.val = _regs[VCPU_REGS_RAX];
-               emulate_2op_SrcV("cmp", src, dst, _eflags);
-               if (_eflags & EFLG_ZF) {
+               c->src.orig_val = c->src.val;
+               c->src.val = c->regs[VCPU_REGS_RAX];
+               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+               if (ctxt->eflags & EFLG_ZF) {
                        /* Success: write back to memory. */
-                       dst.val = src.orig_val;
+                       c->dst.val = c->src.orig_val;
                } else {
                        /* Failure: write the value we saw to EAX. */
-                       dst.type = OP_REG;
-                       dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
+                       c->dst.type = OP_REG;
+                       c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
                }
                break;
        case 0xb3:
              btr:              /* btr */
-               src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-               emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
                break;
        case 0xb6 ... 0xb7:     /* movzx */
-               dst.bytes = op_bytes;
-               dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
+                                                      : (u16) c->src.val;
                break;
        case 0xba:              /* Grp8 */
-               switch (modrm_reg & 3) {
+               switch (c->modrm_reg & 3) {
                case 0:
                        goto bt;
                case 1:
@@ -1483,146 +1762,103 @@ twobyte_insn:
                break;
        case 0xbb:
              btc:              /* btc */
-               src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-               emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
                break;
        case 0xbe ... 0xbf:     /* movsx */
-               dst.bytes = op_bytes;
-               dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
+                                                       (s16) c->src.val;
+               break;
+       case 0xc3:              /* movnti */
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
+                                                       (u64) c->src.val;
                break;
        }
        goto writeback;
 
 twobyte_special_insn:
-       /* Disable writeback. */
-       no_wb = 1;
-       switch (b) {
+       switch (c->b) {
        case 0x06:
                emulate_clts(ctxt->vcpu);
                break;
+       case 0x08:              /* invd */
+               break;
        case 0x09:              /* wbinvd */
                break;
        case 0x0d:              /* GrpP (prefetch) */
        case 0x18:              /* Grp16 (prefetch/nop) */
                break;
        case 0x20: /* mov cr, reg */
-               if (modrm_mod != 3)
+               if (c->modrm_mod != 3)
                        goto cannot_emulate;
-               _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
+               c->regs[c->modrm_rm] =
+                               realmode_get_cr(ctxt->vcpu, c->modrm_reg);
                break;
        case 0x22: /* mov reg, cr */
-               if (modrm_mod != 3)
+               if (c->modrm_mod != 3)
                        goto cannot_emulate;
-               realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
+               realmode_set_cr(ctxt->vcpu,
+                               c->modrm_reg, c->modrm_val, &ctxt->eflags);
                break;
        case 0x30:
                /* wrmsr */
-               msr_data = (u32)_regs[VCPU_REGS_RAX]
-                       | ((u64)_regs[VCPU_REGS_RDX] << 32);
-               rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
+               msr_data = (u32)c->regs[VCPU_REGS_RAX]
+                       | ((u64)c->regs[VCPU_REGS_RDX] << 32);
+               rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
                if (rc) {
                        kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
-                       _eip = ctxt->vcpu->rip;
+                       c->eip = ctxt->vcpu->rip;
                }
                rc = X86EMUL_CONTINUE;
                break;
        case 0x32:
                /* rdmsr */
-               rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
+               rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
                if (rc) {
                        kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
-                       _eip = ctxt->vcpu->rip;
+                       c->eip = ctxt->vcpu->rip;
                } else {
-                       _regs[VCPU_REGS_RAX] = (u32)msr_data;
-                       _regs[VCPU_REGS_RDX] = msr_data >> 32;
+                       c->regs[VCPU_REGS_RAX] = (u32)msr_data;
+                       c->regs[VCPU_REGS_RDX] = msr_data >> 32;
                }
                rc = X86EMUL_CONTINUE;
                break;
        case 0x80 ... 0x8f: /* jnz rel, etc*/ {
                long int rel;
 
-               switch (op_bytes) {
+               switch (c->op_bytes) {
                case 2:
-                       rel = insn_fetch(s16, 2, _eip);
+                       rel = insn_fetch(s16, 2, c->eip);
                        break;
                case 4:
-                       rel = insn_fetch(s32, 4, _eip);
+                       rel = insn_fetch(s32, 4, c->eip);
                        break;
                case 8:
-                       rel = insn_fetch(s64, 8, _eip);
+                       rel = insn_fetch(s64, 8, c->eip);
                        break;
                default:
                        DPRINTF("jnz: Invalid op_bytes\n");
                        goto cannot_emulate;
                }
-               if (test_cc(b, _eflags))
+               if (test_cc(c->b, ctxt->eflags))
                        JMP_REL(rel);
                break;
        }
        case 0xc7:              /* Grp9 (cmpxchg8b) */
-               {
-                       u64 old, new;
-                       if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
-                                                                       != 0)
-                               goto done;
-                       if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
-                           ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
-                               _regs[VCPU_REGS_RAX] = (u32) (old >> 0);
-                               _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
-                               _eflags &= ~EFLG_ZF;
-                       } else {
-                               new = ((u64)_regs[VCPU_REGS_RCX] << 32)
-                                       | (u32) _regs[VCPU_REGS_RBX];
-                               if ((rc = ops->cmpxchg_emulated(cr2, &old,
-                                                         &new, 8, ctxt->vcpu)) != 0)
-                                       goto done;
-                               _eflags |= EFLG_ZF;
-                       }
-                       break;
-               }
+               rc = emulate_grp9(ctxt, ops, cr2);
+               if (rc != 0)
+                       goto done;
+               break;
        }
+       /* Disable writeback. */
+       c->dst.type = OP_NONE;
        goto writeback;
 
 cannot_emulate:
-       DPRINTF("Cannot emulate %02x\n", b);
+       DPRINTF("Cannot emulate %02x\n", c->b);
+       c->eip = saved_eip;
        return -1;
 }
-
-#ifdef __XEN__
-
-#include <asm/mm.h>
-#include <asm/uaccess.h>
-
-int
-x86_emulate_read_std(unsigned long addr,
-                    unsigned long *val,
-                    unsigned int bytes, struct x86_emulate_ctxt *ctxt)
-{
-       unsigned int rc;
-
-       *val = 0;
-
-       if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
-               propagate_page_fault(addr + bytes - rc, 0);     /* read fault */
-               return X86EMUL_PROPAGATE_FAULT;
-       }
-
-       return X86EMUL_CONTINUE;
-}
-
-int
-x86_emulate_write_std(unsigned long addr,
-                     unsigned long val,
-                     unsigned int bytes, struct x86_emulate_ctxt *ctxt)
-{
-       unsigned int rc;
-
-       if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
-               propagate_page_fault(addr + bytes - rc, PGERR_write_access);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
-
-       return X86EMUL_CONTINUE;
-}
-
-#endif