xen: clean up domain mode predicates
[safe/jmp/linux-2.6] / arch / x86 / xen / enlighten.c
index c13698f..b106e82 100644 (file)
@@ -30,7 +30,6 @@
 #include <xen/interface/xen.h>
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
-#include <xen/interface/sched.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvc-console.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/fixmap.h>
 #include <asm/processor.h>
+#include <asm/msr-index.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/reboot.h>
-#include <asm/pgalloc.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -57,6 +56,21 @@ EXPORT_SYMBOL_GPL(hypercall_page);
 DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 
+enum xen_domain_type xen_domain_type = XEN_NATIVE;
+EXPORT_SYMBOL_GPL(xen_domain_type);
+
+/*
+ * Identity map, in addition to plain kernel map.  This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
 /*
  * Note about cr3 (pagetable base) values:
  *
@@ -168,10 +182,14 @@ void xen_vcpu_restore(void)
 
 static void __init xen_banner(void)
 {
+       unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
+       struct xen_extraversion extra;
+       HYPERVISOR_xen_version(XENVER_extraversion, &extra);
+
        printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
               pv_info.name);
-       printk(KERN_INFO "Hypervisor signature: %s%s\n",
-              xen_start_info->magic,
+       printk(KERN_INFO "Xen version: %d.%d%s%s\n",
+              version >> 16, version & 0xffff, extra.extraversion,
               xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
 }
 
@@ -210,103 +228,68 @@ static unsigned long xen_get_debugreg(int reg)
        return HYPERVISOR_get_debugreg(reg);
 }
 
-static unsigned long xen_save_fl(void)
+static void xen_leave_lazy(void)
 {
-       struct vcpu_info *vcpu;
-       unsigned long flags;
-
-       vcpu = x86_read_percpu(xen_vcpu);
-
-       /* flag has opposite sense of mask */
-       flags = !vcpu->evtchn_upcall_mask;
-
-       /* convert to IF type flag
-          -0 -> 0x00000000
-          -1 -> 0xffffffff
-       */
-       return (-flags) & X86_EFLAGS_IF;
+       paravirt_leave_lazy(paravirt_get_lazy_mode());
+       xen_mc_flush();
 }
 
-static void xen_restore_fl(unsigned long flags)
+static unsigned long xen_store_tr(void)
 {
-       struct vcpu_info *vcpu;
-
-       /* convert from IF type flag */
-       flags = !(flags & X86_EFLAGS_IF);
-
-       /* There's a one instruction preempt window here.  We need to
-          make sure we're don't switch CPUs between getting the vcpu
-          pointer and updating the mask. */
-       preempt_disable();
-       vcpu = x86_read_percpu(xen_vcpu);
-       vcpu->evtchn_upcall_mask = flags;
-       preempt_enable_no_resched();
-
-       /* Doesn't matter if we get preempted here, because any
-          pending event will get dealt with anyway. */
-
-       if (flags == 0) {
-               preempt_check_resched();
-               barrier(); /* unmask then check (avoid races) */
-               if (unlikely(vcpu->evtchn_upcall_pending))
-                       force_evtchn_callback();
-       }
+       return 0;
 }
 
-static void xen_irq_disable(void)
+/*
+ * Set the page permissions for a particular virtual address.  If the
+ * address is a vmalloc mapping (or other non-linear mapping), then
+ * find the linear mapping of the page and also set its protections to
+ * match.
+ */
+static void set_aliased_prot(void *v, pgprot_t prot)
 {
-       /* There's a one instruction preempt window here.  We need to
-          make sure we're don't switch CPUs between getting the vcpu
-          pointer and updating the mask. */
-       preempt_disable();
-       x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
-       preempt_enable_no_resched();
-}
+       int level;
+       pte_t *ptep;
+       pte_t pte;
+       unsigned long pfn;
+       struct page *page;
 
-static void xen_irq_enable(void)
-{
-       struct vcpu_info *vcpu;
+       ptep = lookup_address((unsigned long)v, &level);
+       BUG_ON(ptep == NULL);
 
-       /* We don't need to worry about being preempted here, since
-          either a) interrupts are disabled, so no preemption, or b)
-          the caller is confused and is trying to re-enable interrupts
-          on an indeterminate processor. */
+       pfn = pte_pfn(*ptep);
+       page = pfn_to_page(pfn);
 
-       vcpu = x86_read_percpu(xen_vcpu);
-       vcpu->evtchn_upcall_mask = 0;
+       pte = pfn_pte(pfn, prot);
 
-       /* Doesn't matter if we get preempted here, because any
-          pending event will get dealt with anyway. */
+       if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
+               BUG();
 
-       barrier(); /* unmask then check (avoid races) */
-       if (unlikely(vcpu->evtchn_upcall_pending))
-               force_evtchn_callback();
-}
+       if (!PageHighMem(page)) {
+               void *av = __va(PFN_PHYS(pfn));
 
-static void xen_safe_halt(void)
-{
-       /* Blocking includes an implicit local_irq_enable(). */
-       if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
-               BUG();
+               if (av != v)
+                       if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
+                               BUG();
+       } else
+               kmap_flush_unused();
 }
 
-static void xen_halt(void)
+static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
-       if (irqs_disabled())
-               HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
-       else
-               xen_safe_halt();
-}
+       const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+       int i;
 
-static void xen_leave_lazy(void)
-{
-       paravirt_leave_lazy(paravirt_get_lazy_mode());
-       xen_mc_flush();
+       for(i = 0; i < entries; i += entries_per_page)
+               set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
 }
 
-static unsigned long xen_store_tr(void)
+static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
 {
-       return 0;
+       const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+       int i;
+
+       for(i = 0; i < entries; i += entries_per_page)
+               set_aliased_prot(ldt + i, PAGE_KERNEL);
 }
 
 static void xen_set_ldt(const void *addr, unsigned entries)
@@ -409,8 +392,7 @@ static void xen_load_gs_index(unsigned int idx)
 static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
                                const void *ptr)
 {
-       unsigned long lp = (unsigned long)&dt[entrynum];
-       xmaddr_t mach_lp = virt_to_machine(lp);
+       xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
        u64 entry = *(u64 *)ptr;
 
        preempt_disable();
@@ -543,7 +525,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 }
 
 static void xen_load_sp0(struct tss_struct *tss,
-                         struct thread_struct *thread)
+                        struct thread_struct *thread)
 {
        struct multicall_space mcs = xen_mc_entry(0);
        MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -711,33 +693,102 @@ static void set_current_cr3(void *v)
        x86_write_percpu(xen_current_cr3, (unsigned long)v);
 }
 
-static void xen_write_cr3(unsigned long cr3)
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
 {
        struct mmuext_op *op;
        struct multicall_space mcs;
-       unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+       unsigned long mfn;
 
-       BUG_ON(preemptible());
+       if (cr3)
+               mfn = pfn_to_mfn(PFN_DOWN(cr3));
+       else
+               mfn = 0;
 
-       mcs = xen_mc_entry(sizeof(*op));  /* disables interrupts */
+       WARN_ON(mfn == 0 && kernel);
 
-       /* Update while interrupts are disabled, so its atomic with
-          respect to ipis */
-       x86_write_percpu(xen_cr3, cr3);
+       mcs = __xen_mc_entry(sizeof(*op));
 
        op = mcs.args;
-       op->cmd = MMUEXT_NEW_BASEPTR;
+       op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
        op->arg1.mfn = mfn;
 
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 
-       /* Update xen_update_cr3 once the batch has actually
-          been submitted. */
-       xen_mc_callback(set_current_cr3, (void *)cr3);
+       if (kernel) {
+               x86_write_percpu(xen_cr3, cr3);
+
+               /* Update xen_current_cr3 once the batch has actually
+                  been submitted. */
+               xen_mc_callback(set_current_cr3, (void *)cr3);
+       }
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+       BUG_ON(preemptible());
+
+       xen_mc_batch();  /* disables interrupts */
+
+       /* Update while interrupts are disabled, so its atomic with
+          respect to ipis */
+       x86_write_percpu(xen_cr3, cr3);
+
+       __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+       {
+               pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+               if (user_pgd)
+                       __xen_write_cr3(false, __pa(user_pgd));
+               else
+                       __xen_write_cr3(false, 0);
+       }
+#endif
 
        xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
 }
 
+static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+{
+       int ret;
+
+       ret = 0;
+
+       switch(msr) {
+#ifdef CONFIG_X86_64
+               unsigned which;
+               u64 base;
+
+       case MSR_FS_BASE:               which = SEGBASE_FS; goto set;
+       case MSR_KERNEL_GS_BASE:        which = SEGBASE_GS_USER; goto set;
+       case MSR_GS_BASE:               which = SEGBASE_GS_KERNEL; goto set;
+
+       set:
+               base = ((u64)high << 32) | low;
+               if (HYPERVISOR_set_segment_base(which, base) != 0)
+                       ret = -EFAULT;
+               break;
+#endif
+
+       case MSR_STAR:
+       case MSR_CSTAR:
+       case MSR_LSTAR:
+       case MSR_SYSCALL_MASK:
+       case MSR_IA32_SYSENTER_CS:
+       case MSR_IA32_SYSENTER_ESP:
+       case MSR_IA32_SYSENTER_EIP:
+               /* Fast syscall setup is all done in hypercalls, so
+                  these are all ignored.  Stub them out here to stop
+                  Xen console noise. */
+               break;
+
+       default:
+               ret = native_write_msr_safe(msr, low, high);
+       }
+
+       return ret;
+}
+
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
 static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -774,7 +825,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
                SetPagePinned(page);
 
                if (!PageHighMem(page)) {
-                       make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+                       make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
                        if (level == PT_PTE)
                                pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
                } else
@@ -794,6 +845,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
        xen_alloc_ptpage(mm, pfn, PT_PMD);
 }
 
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+       pgd_t *pgd = mm->pgd;
+       int ret = 0;
+
+       BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+       {
+               struct page *page = virt_to_page(pgd);
+               pgd_t *user_pgd;
+
+               BUG_ON(page->private != 0);
+
+               ret = -ENOMEM;
+
+               user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+               page->private = (unsigned long)user_pgd;
+
+               if (user_pgd != NULL) {
+                       user_pgd[pgd_index(VSYSCALL_START)] =
+                               __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+                       ret = 0;
+               }
+
+               BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+       }
+#endif
+
+       return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+       pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+       if (user_pgd)
+               free_page((unsigned long)user_pgd);
+#endif
+}
+
 /* This should never happen until we're OK to use struct page */
 static void xen_release_ptpage(u32 pfn, unsigned level)
 {
@@ -916,6 +1009,9 @@ static __init void xen_post_allocator_init(void)
        pv_mmu_ops.release_pud = xen_release_pud;
 #endif
 
+#ifdef CONFIG_X86_64
+       SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
        xen_mark_init_mm_pinned();
 }
 
@@ -1011,7 +1107,9 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
 #ifdef CONFIG_X86_32
        case FIX_WP_TEST:
        case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
        case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
 #else
        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
 #endif
@@ -1027,6 +1125,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
        }
 
        __native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+       /* Replicate changes to map the vsyscall page into the user
+          pagetable vsyscall mapping. */
+       if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+               unsigned long vaddr = __fix_to_virt(idx);
+               set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+       }
+#endif
 }
 
 static const struct pv_info xen_info __initdata = {
@@ -1072,12 +1179,16 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        .wbinvd = native_wbinvd,
 
        .read_msr = native_read_msr_safe,
-       .write_msr = native_write_msr_safe,
+       .write_msr = xen_write_msr_safe,
        .read_tsc = native_read_tsc,
        .read_pmc = native_read_pmc,
 
        .iret = xen_iret,
        .irq_enable_sysexit = xen_sysexit,
+#ifdef CONFIG_X86_64
+       .usergs_sysret32 = xen_sysret32,
+       .usergs_sysret64 = xen_sysret64,
+#endif
 
        .load_tr_desc = paravirt_nop,
        .set_ldt = xen_set_ldt,
@@ -1088,6 +1199,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        .load_gs_index = xen_load_gs_index,
 #endif
 
+       .alloc_ldt = xen_alloc_ldt,
+       .free_ldt = xen_free_ldt,
+
        .store_gdt = native_store_gdt,
        .store_idt = native_store_idt,
        .store_tr = xen_store_tr,
@@ -1109,40 +1223,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        },
 };
 
-static void __init __xen_init_IRQ(void)
-{
-#ifdef CONFIG_X86_64
-       int i;
-
-       /* Create identity vector->irq map */
-       for(i = 0; i < NR_VECTORS; i++) {
-               int cpu;
-
-               for_each_possible_cpu(cpu)
-                       per_cpu(vector_irq, cpu)[i] = i;
-       }
-#endif /* CONFIG_X86_64 */
-
-       xen_init_IRQ();
-}
-
-static const struct pv_irq_ops xen_irq_ops __initdata = {
-       .init_IRQ = __xen_init_IRQ,
-       .save_fl = xen_save_fl,
-       .restore_fl = xen_restore_fl,
-       .irq_disable = xen_irq_disable,
-       .irq_enable = xen_irq_enable,
-       .safe_halt = xen_safe_halt,
-       .halt = xen_halt,
-#ifdef CONFIG_X86_64
-       .adjust_exception_frame = xen_adjust_exception_frame,
-#endif
-};
-
 static const struct pv_apic_ops xen_apic_ops __initdata = {
 #ifdef CONFIG_X86_LOCAL_APIC
        .apic_write = xen_apic_write,
-       .apic_write_atomic = xen_apic_write,
        .apic_read = xen_apic_read,
        .setup_boot_clock = paravirt_nop,
        .setup_secondary_clock = paravirt_nop,
@@ -1168,8 +1251,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .pte_update = paravirt_nop,
        .pte_update_defer = paravirt_nop,
 
-       .pgd_alloc = __paravirt_pgd_alloc,
-       .pgd_free = paravirt_nop,
+       .pgd_alloc = xen_pgd_alloc,
+       .pgd_free = xen_pgd_free,
 
        .alloc_pte = xen_alloc_pte_init,
        .release_pte = xen_release_pte_init,
@@ -1193,7 +1276,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .ptep_modify_prot_commit = __ptep_modify_prot_commit,
 
        .pte_val = xen_pte_val,
-       .pte_flags = native_pte_val,
+       .pte_flags = native_pte_flags,
        .pgd_val = xen_pgd_val,
 
        .make_pte = xen_make_pte,
@@ -1304,7 +1387,7 @@ static unsigned long m2p(phys_addr_t maddr)
 {
        phys_addr_t paddr;
 
-       maddr &= PTE_MASK;
+       maddr &= PTE_PFN_MASK;
        paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
 
        return paddr;
@@ -1362,13 +1445,6 @@ static void set_page_prot(void *addr, pgprot_t prot)
                BUG();
 }
 
-/*
- * Identity map, in addition to plain kernel map.  This needs to be
- * large enough to allocate page table pages to allocate the rest.
- * Each page can map 2MB.
- */
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
-
 static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
 {
        unsigned pmdidx, pteidx;
@@ -1468,6 +1544,7 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf
        set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
        set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
        set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+       set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
        set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
        set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
 
@@ -1480,7 +1557,15 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf
 
        /* Switch over */
        pgd = init_level4_pgt;
-       xen_write_cr3(__pa(pgd));
+
+       /*
+        * At this stage there can be no user pgd, and no page
+        * structure to attach it to, so make sure we just set kernel
+        * pgd.
+        */
+       xen_mc_batch();
+       __xen_write_cr3(true, __pa(pgd));
+       xen_mc_issue(PARAVIRT_LAZY_CPU);
 
        reserve_early(__pa(xen_start_info->pt_base),
                      __pa(xen_start_info->pt_base +
@@ -1531,6 +1616,8 @@ asmlinkage void __init xen_start_kernel(void)
        if (!xen_start_info)
                return;
 
+       xen_domain_type = XEN_PV_DOMAIN;
+
        BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
 
        xen_setup_features();
@@ -1540,10 +1627,11 @@ asmlinkage void __init xen_start_kernel(void)
        pv_init_ops = xen_init_ops;
        pv_time_ops = xen_time_ops;
        pv_cpu_ops = xen_cpu_ops;
-       pv_irq_ops = xen_irq_ops;
        pv_apic_ops = xen_apic_ops;
        pv_mmu_ops = xen_mmu_ops;
 
+       xen_init_irq_ops();
+
        if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
                pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
                pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -1567,7 +1655,7 @@ asmlinkage void __init xen_start_kernel(void)
 
        /* Prevent unwanted bits from being set in PTEs. */
        __supported_pte_mask &= ~_PAGE_GLOBAL;
-       if (!is_initial_xendomain())
+       if (!xen_initial_domain())
                __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
        /* Don't do the full vcpu_info placement stuff until we have a
@@ -1602,7 +1690,7 @@ asmlinkage void __init xen_start_kernel(void)
        boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
        boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
 
-       if (!is_initial_xendomain()) {
+       if (!xen_initial_domain()) {
                add_preferred_console("xenboot", 0, NULL);
                add_preferred_console("tty", 0, NULL);
                add_preferred_console("hvc", 0, NULL);