lguest: restrict CPUID to avoid perf counter wrmsr

[safe/jmp/linux-2.6] / arch / x86 / lguest / boot.c
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c

index 5287081..f2bf1f7 100644 (file)
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -67,6 +67,7 @@
  #include <asm/mce.h>
  #include <asm/io.h>
  #include <asm/i387.h>
+#include <asm/stackprotector.h>
  #include <asm/reboot.h>                /* for struct machine_ops */
  
  /*G:010 Welcome to the Guest!
@@ -86,7 +87,7 @@ struct lguest_data lguest_data = {
  
  /*G:037 async_hcall() is pretty simple: I'm quite proud of it really.  We have a
   * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall.  Each entry in the ring has 4 slots for the hypercall
+ * do a normal hypercall.  Each entry in the ring has 5 slots for the hypercall
   * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
   * and 255 once the Host has finished with it.
   *
@@ -95,7 +96,8 @@ struct lguest_data lguest_data = {
   * effect of causing the Host to run all the stored calls in the ring buffer
   * which empties it for next time! */
  static void async_hcall(unsigned long call, unsigned long arg1,
-                       unsigned long arg2, unsigned long arg3)
+                       unsigned long arg2, unsigned long arg3,
+                       unsigned long arg4)
  {
         /* Note: This code assumes we're uniprocessor. */
         static unsigned int next_call;
@@ -107,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1,
         local_irq_save(flags);
         if (lguest_data.hcall_status[next_call] != 0xFF) {
                 /* Table full, so do normal hcall which will flush table. */
-               hcall(call, arg1, arg2, arg3);
+               kvm_hypercall4(call, arg1, arg2, arg3, arg4);
         } else {
                 lguest_data.hcalls[next_call].arg0 = call;
                 lguest_data.hcalls[next_call].arg1 = arg1;
                 lguest_data.hcalls[next_call].arg2 = arg2;
                 lguest_data.hcalls[next_call].arg3 = arg3;
+               lguest_data.hcalls[next_call].arg4 = arg4;
                 /* Arguments must all be written before we mark it to go */
                 wmb();
                 lguest_data.hcall_status[next_call] = 0;
@@ -134,32 +137,65 @@ static void async_hcall(unsigned long call, unsigned long arg1,
   *
   * So, when we're in lazy mode, we call async_hcall() to store the call for
   * future processing: */
-static void lazy_hcall(unsigned long call,
+static void lazy_hcall1(unsigned long call,
+                      unsigned long arg1)
+{
+       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+               kvm_hypercall1(call, arg1);
+       else
+               async_hcall(call, arg1, 0, 0, 0);
+}
+
+static void lazy_hcall2(unsigned long call,
+                      unsigned long arg1,
+                      unsigned long arg2)
+{
+       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+               kvm_hypercall2(call, arg1, arg2);
+       else
+               async_hcall(call, arg1, arg2, 0, 0);
+}
+
+static void lazy_hcall3(unsigned long call,
                        unsigned long arg1,
                        unsigned long arg2,
                        unsigned long arg3)
  {
         if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-               hcall(call, arg1, arg2, arg3);
+               kvm_hypercall3(call, arg1, arg2, arg3);
         else
-               async_hcall(call, arg1, arg2, arg3);
+               async_hcall(call, arg1, arg2, arg3, 0);
  }
  
+#ifdef CONFIG_X86_PAE
+static void lazy_hcall4(unsigned long call,
+                      unsigned long arg1,
+                      unsigned long arg2,
+                      unsigned long arg3,
+                      unsigned long arg4)
+{
+       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+               kvm_hypercall4(call, arg1, arg2, arg3, arg4);
+       else
+               async_hcall(call, arg1, arg2, arg3, arg4);
+}
+#endif
+
  /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
   * issue the do-nothing hypercall to flush any stored calls. */
  static void lguest_leave_lazy_mmu_mode(void)
  {
-       hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+       kvm_hypercall0(LHCALL_FLUSH_ASYNC);
         paravirt_leave_lazy_mmu();
  }
  
  static void lguest_end_context_switch(struct task_struct *next)
  {
-       hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+       kvm_hypercall0(LHCALL_FLUSH_ASYNC);
         paravirt_end_context_switch(next);
  }
  
-/*G:033
+/*G:032
   * After that diversion we return to our first native-instruction
   * replacements: four functions for interrupt control.
   *
@@ -179,30 +215,28 @@ static unsigned long save_fl(void)
  {
         return lguest_data.irq_enabled;
  }
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-
-/* restore_flags() just sets the flags back to the value given. */
-static void restore_fl(unsigned long flags)
-{
-       lguest_data.irq_enabled = flags;
-}
-PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
  
  /* Interrupts go off... */
  static void irq_disable(void)
  {
         lguest_data.irq_enabled = 0;
  }
+
+/* Let's pause a moment.  Remember how I said these are called so often?
+ * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
+ * break some rules.  In particular, these functions are assumed to save their
+ * own registers if they need to: normal C functions assume they can trash the
+ * eax register.  To use normal C functions, we use
+ * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
+ * C function, then restores it. */
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
  PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
+/*:*/
  
-/* Interrupts go on... */
-static void irq_enable(void)
-{
-       lguest_data.irq_enabled = X86_EFLAGS_IF;
-}
-PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+/* These are in i386_head.S */
+extern void lg_irq_enable(void);
+extern void lg_restore_fl(unsigned long flags);
  
-/*:*/
  /*M:003 Note that we don't check for outstanding interrupts when we re-enable
   * them (or when we unmask an interrupt).  This seems to work for the moment,
   * since interrupts are rare and we'll just get the interrupt on the next timer
@@ -235,7 +269,7 @@ static void lguest_write_idt_entry(gate_desc *dt,
         /* Keep the local copy up to date. */
         native_write_idt_entry(dt, entrynum, g);
         /* Tell Host about this new entry. */
-       hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
+       kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
  }
  
  /* Changing to a different IDT is very rare: we keep the IDT up-to-date every
@@ -247,7 +281,7 @@ static void lguest_load_idt(const struct desc_ptr *desc)
         struct desc_struct *idt = (void *)desc->address;
  
         for (i = 0; i < (desc->size+1)/8; i++)
-               hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
+               kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
  }
  
  /*
@@ -260,15 +294,15 @@ static void lguest_load_idt(const struct desc_ptr *desc)
   * controls the entire thing and the Guest asks it to make changes using the
   * LOAD_GDT hypercall.
   *
- * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY
- * hypercall and use that repeatedly to load a new IDT.  I don't think it
- * really matters, but wouldn't it be nice if they were the same?  Wouldn't
- * it be even better if you were the one to send the patch to fix it?
+ * This is exactly like the IDT code.
   */
  static void lguest_load_gdt(const struct desc_ptr *desc)
  {
-       BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
-       hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
+       unsigned int i;
+       struct desc_struct *gdt = (void *)desc->address;
+
+       for (i = 0; i < (desc->size+1)/8; i++)
+               kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
  }
  
  /* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
@@ -278,7 +312,9 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
                                    const void *desc, int type)
  {
         native_write_gdt_entry(dt, entrynum, desc, type);
-       hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+       /* Tell Host about this new entry. */
+       kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum,
+                      dt[entrynum].a, dt[entrynum].b);
  }
  
  /* OK, I lied.  There are three "thread local storage" GDT entries which change
@@ -290,7 +326,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
          * can't handle us removing entries we're currently using.  So we clear
          * the GS register here: if it's needed it'll be reloaded anyway. */
         lazy_load_gs(0);
-       lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
+       lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
  }
  
  /*G:038 That's enough excitement for now, back to ploughing through each of
@@ -343,11 +379,16 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
  
         native_cpuid(ax, bx, cx, dx);
         switch (function) {
+       case 0: /* ID and highest CPUID.  Futureproof a little by sticking to
+                * older ones. */
+               if (*ax > 5)
+                       *ax = 5;
+               break;
         case 1: /* Basic feature request. */
                 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
                 *cx &= 0x00002201;
-               /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
-               *dx &= 0x07808111;
+               /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
+               *dx &= 0x07808151;
                 /* The Host can do a nice optimization if it knows that the
                  * kernel mappings (addresses above 0xC0000000 or whatever
                  * PAGE_OFFSET is set to) haven't changed.  But Linux calls
@@ -366,6 +407,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
                 if (*ax > 0x80000008)
                         *ax = 0x80000008;
                 break;
+       case 0x80000001:
+               /* Here we should fix nx cap depending on host. */
+               /* For this version of PAE, we just clear NX bit. */
+               *dx &= ~(1 << 20);
+               break;
         }
  }
  
@@ -388,7 +434,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
  static unsigned long current_cr0;
  static void lguest_write_cr0(unsigned long val)
  {
-       lazy_hcall(LHCALL_TS, val & X86_CR0_TS, 0, 0);
+       lazy_hcall1(LHCALL_TS, val & X86_CR0_TS);
         current_cr0 = val;
  }
  
@@ -402,7 +448,7 @@ static unsigned long lguest_read_cr0(void)
   * the vowels have been optimized out. */
  static void lguest_clts(void)
  {
-       lazy_hcall(LHCALL_TS, 0, 0, 0);
+       lazy_hcall1(LHCALL_TS, 0);
         current_cr0 &= ~X86_CR0_TS;
  }
  
@@ -424,7 +470,7 @@ static bool cr3_changed = false;
  static void lguest_write_cr3(unsigned long cr3)
  {
         lguest_data.pgdir = cr3;
-       lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+       lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
         cr3_changed = true;
  }
  
@@ -496,22 +542,55 @@ static void lguest_write_cr4(unsigned long val)
   * into a process' address space.  We set the entry then tell the Host the
   * toplevel and address this corresponds to.  The Guest uses one pagetable per
   * process, so we need to tell the Host which one we're changing (mm->pgd). */
+static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
+                              pte_t *ptep)
+{
+#ifdef CONFIG_X86_PAE
+       lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
+                   ptep->pte_low, ptep->pte_high);
+#else
+       lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+#endif
+}
+
  static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
                               pte_t *ptep, pte_t pteval)
  {
-       *ptep = pteval;
-       lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
+       native_set_pte(ptep, pteval);
+       lguest_pte_update(mm, addr, ptep);
+}
+
+/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+ * to set a middle-level entry when PAE is activated.
+ * Again, we set the entry then tell the Host which page we changed,
+ * and the index of the entry we changed. */
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+       native_set_pud(pudp, pudval);
+
+       /* 32-byte aligned PDPT address and the index. */
+       lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
+                  (__pa(pudp) & 0x1F) / sizeof(pud_t));
+}
+
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+       native_set_pmd(pmdp, pmdval);
+       lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
+                  (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
  }
+#else
  
-/* The Guest calls this to set a top-level entry.  Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
+/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
+ * activated. */
  static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
  {
-       *pmdp = pmdval;
-       lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
-                  (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
+       native_set_pmd(pmdp, pmdval);
+       lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
+                  (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
  }
+#endif
  
  /* There are a couple of legacy places where the kernel sets a PTE, but we
   * don't know the top level any more.  This is useless for us, since we don't
@@ -524,11 +603,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
   * which brings boot back to 0.25 seconds. */
  static void lguest_set_pte(pte_t *ptep, pte_t pteval)
  {
-       *ptep = pteval;
+       native_set_pte(ptep, pteval);
+       if (cr3_changed)
+               lazy_hcall1(LHCALL_FLUSH_TLB, 1);
+}
+
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+       native_set_pte_atomic(ptep, pte);
         if (cr3_changed)
-               lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+               lazy_hcall1(LHCALL_FLUSH_TLB, 1);
  }
  
+void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+       native_pte_clear(mm, addr, ptep);
+       lguest_pte_update(mm, addr, ptep);
+}
+
+void lguest_pmd_clear(pmd_t *pmdp)
+{
+       lguest_set_pmd(pmdp, __pmd(0));
+}
+#endif
+
  /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
   * native page table operations.  On native hardware you can set a new page
   * table entry whenever you want, but if you want to remove one you have to do
@@ -542,7 +641,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
  static void lguest_flush_tlb_single(unsigned long addr)
  {
         /* Simply set it to zero: if it was not, it will fault back in. */
-       lazy_hcall(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+       lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
  }
  
  /* This is what happens after the Guest has removed a large number of entries.
@@ -550,7 +649,7 @@ static void lguest_flush_tlb_single(unsigned long addr)
   * have changed, ie. virtual addresses below PAGE_OFFSET. */
  static void lguest_flush_tlb_user(void)
  {
-       lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
+       lazy_hcall1(LHCALL_FLUSH_TLB, 0);
  }
  
  /* This is called when the kernel page tables have changed.  That's not very
@@ -558,7 +657,7 @@ static void lguest_flush_tlb_user(void)
   * slow), so it's worth separating this from the user flushing above. */
  static void lguest_flush_tlb_kernel(void)
  {
-       lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+       lazy_hcall1(LHCALL_FLUSH_TLB, 1);
  }
  
  /*
@@ -600,13 +699,12 @@ static void __init lguest_init_IRQ(void)
  {
         unsigned int i;
  
-       for (i = 0; i < LGUEST_IRQS; i++) {
-               int vector = FIRST_EXTERNAL_VECTOR + i;
+       for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
                 /* Some systems map "vectors" to interrupts weirdly.  Lguest has
                  * a straightforward 1 to 1 mapping, so force that here. */
-               __get_cpu_var(vector_irq)[vector] = i;
-               if (vector != SYSCALL_VECTOR)
-                       set_intr_gate(vector, interrupt[i]);
+               __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+               if (i != SYSCALL_VECTOR)
+                       set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
         }
         /* This call is required to set up for 4k stacks, where we have
          * separate stacks for hard and soft interrupts. */
@@ -615,7 +713,7 @@ static void __init lguest_init_IRQ(void)
  
  void lguest_setup_irq(unsigned int irq)
  {
-       irq_to_desc_alloc_cpu(irq, 0);
+       irq_to_desc_alloc_node(irq, 0);
         set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
                                       handle_level_irq, "level");
  }
@@ -642,7 +740,7 @@ static unsigned long lguest_tsc_khz(void)
  
  /* If we can't use the TSC, the kernel falls back to our lower-priority
   * "lguest_clock", where we read the time value given to us by the Host. */
-static cycle_t lguest_clock_read(void)
+static cycle_t lguest_clock_read(struct clocksource *cs)
  {
         unsigned long sec, nsec;
  
@@ -695,7 +793,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta,
         }
  
         /* Please wake us this far in the future. */
-       hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
+       kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta);
         return 0;
  }
  
@@ -706,7 +804,7 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode,
         case CLOCK_EVT_MODE_UNUSED:
         case CLOCK_EVT_MODE_SHUTDOWN:
                 /* A 0 argument shuts the clock down. */
-               hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0);
+               kvm_hypercall0(LHCALL_SET_CLOCKEVENT);
                 break;
         case CLOCK_EVT_MODE_ONESHOT:
                 /* This is what we expect. */
@@ -781,8 +879,8 @@ static void lguest_time_init(void)
  static void lguest_load_sp0(struct tss_struct *tss,
                             struct thread_struct *thread)
  {
-       lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0,
-                  THREAD_SIZE/PAGE_SIZE);
+       lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
+                  THREAD_SIZE / PAGE_SIZE);
  }
  
  /* Let's just say, I wouldn't do debugging under a Guest. */
@@ -855,7 +953,7 @@ static void set_lguest_basic_apic_ops(void)
  /* STOP!  Until an interrupt comes in. */
  static void lguest_safe_halt(void)
  {
-       hcall(LHCALL_HALT, 0, 0, 0);
+       kvm_hypercall0(LHCALL_HALT);
  }
  
  /* The SHUTDOWN hypercall takes a string to describe what's happening, and
@@ -865,7 +963,8 @@ static void lguest_safe_halt(void)
   * rather than virtual addresses, so we use __pa() here. */
  static void lguest_power_off(void)
  {
-       hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0);
+       kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
+                                       LGUEST_SHUTDOWN_POWEROFF);
  }
  
  /*
@@ -875,7 +974,7 @@ static void lguest_power_off(void)
   */
  static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
  {
-       hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0);
+       kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF);
         /* The hcall won't return, but to keep gcc happy, we're "done". */
         return NOTIFY_DONE;
  }
@@ -916,7 +1015,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
                 len = sizeof(scratch) - 1;
         scratch[len] = '\0';
         memcpy(scratch, buf, len);
-       hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0);
+       kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch));
  
         /* This routine returns the number of bytes actually written. */
         return len;
@@ -926,7 +1025,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
   * Launcher to reboot us. */
  static void lguest_restart(char *reason)
  {
-       hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
+       kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
  }
  
  /*G:050
@@ -944,10 +1043,10 @@ static void lguest_restart(char *reason)
   *
   * Our current solution is to allow the paravirt back end to optionally patch
   * over the indirect calls to replace them with something more efficient.  We
- * patch the four most commonly called functions: disable interrupts, enable
- * interrupts, restore interrupts and save interrupts.  We usually have 6 or 10
- * bytes to patch into: the Guest versions of these operations are small enough
- * that we can fit comfortably.
+ * patch two of the simplest of the most commonly called functions: disable
+ * interrupts and save interrupts.  We usually have 6 or 10 bytes to patch
+ * into: the Guest versions of these operations are small enough that we can
+ * fit comfortably.
   *
   * First we need assembly templates of each of the patchable Guest operations,
   * and these are in i386_head.S. */
@@ -958,8 +1057,6 @@ static const struct lguest_insns
         const char *start, *end;
  } lguest_insns[] = {
         [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
-       [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
-       [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
         [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
  };
  
@@ -987,7 +1084,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
         return insn_len;
  }
  
-/*G:030 Once we get to lguest_init(), we know we're a Guest.  The various
+/*G:029 Once we get to lguest_init(), we know we're a Guest.  The various
   * pv_ops structures in the kernel provide points for (almost) every routine we
   * have to override to avoid privileged instructions. */
  __init void lguest_init(void)
@@ -997,6 +1094,7 @@ __init void lguest_init(void)
         pv_info.name = "lguest";
         pv_info.paravirt_enabled = 1;
         pv_info.kernel_rpl = 1;
+       pv_info.shared_kernel_pmd = 1;
  
         /* We set up all the lguest overrides for sensitive operations.  These
          * are detailed with the operations themselves. */
@@ -1004,9 +1102,9 @@ __init void lguest_init(void)
         /* interrupt-related operations */
         pv_irq_ops.init_IRQ = lguest_init_IRQ;
         pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
-       pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+       pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
         pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
-       pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
+       pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
         pv_irq_ops.safe_halt = lguest_safe_halt;
  
         /* init-time operations */
@@ -1042,10 +1140,18 @@ __init void lguest_init(void)
         pv_mmu_ops.set_pte = lguest_set_pte;
         pv_mmu_ops.set_pte_at = lguest_set_pte_at;
         pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+       pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+       pv_mmu_ops.pte_clear = lguest_pte_clear;
+       pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+       pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
         pv_mmu_ops.read_cr2 = lguest_read_cr2;
         pv_mmu_ops.read_cr3 = lguest_read_cr3;
         pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
         pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
+       pv_mmu_ops.pte_update = lguest_pte_update;
+       pv_mmu_ops.pte_update_defer = lguest_pte_update;
  
  #ifdef CONFIG_X86_LOCAL_APIC
         /* apic read/write intercepts */
@@ -1064,21 +1170,21 @@ __init void lguest_init(void)
          * lguest_init() where the rest of the fairly chaotic boot setup
          * occurs. */
  
-       /* The native boot code sets up initial page tables immediately after
-        * the kernel itself, and sets init_pg_tables_end so they're not
-        * clobbered.  The Launcher places our initial pagetables somewhere at
-        * the top of our physical memory, so we don't need extra space: set
-        * init_pg_tables_end to the end of the kernel. */
-       init_pg_tables_start = __pa(pg0);
-       init_pg_tables_end = __pa(pg0);
+       /* The stack protector is a weird thing where gcc places a canary
+        * value on the stack and then checks it on return.  This file is
+        * compiled with -fno-stack-protector, so we got this far without
+        * problems.  The value of the canary is kept at offset 20 from the
+        * %gs register, so we need to set that up before calling C functions
+        * in other files. */
+       setup_stack_canary_segment(0);
+       /* We could just call load_stack_canary_segment(), but we might as well
+        * call switch_to_new_gdt() which loads the whole table and sets up
+        * the per-cpu segment descriptor register %fs as well. */
+       switch_to_new_gdt(0);
  
         /* As described in head_32.S, we map the first 128M of memory. */
         max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
  
-       /* Load the %fs segment register (the per-cpu segment register) with
-        * the normal data segment to get through booting. */
-       asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
-
         /* The Host<->Guest Switcher lives at the top of our address space, and
          * the Host told us how big it is when we made LGUEST_INIT hypercall:
          * it put the answer in lguest_data.reserve_mem  */