KVM: MMU: do not free active mmu pages in free_mmu_pages()
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 678e805..934dd1c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
 #include <linux/kvm_para.h>
 #include <linux/pagemap.h>
 #include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/bitops.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+#include "coalesced_mmio.h"
+#endif
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include "irq.h"
+#endif
+
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
 DEFINE_SPINLOCK(kvm_lock);
 LIST_HEAD(vm_list);
 
-static cpumask_t cpus_hardware_enabled;
+static cpumask_var_t cpus_hardware_enabled;
 
 struct kmem_cache *kvm_vcpu_cache;
 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
 
 static __read_mostly struct preempt_ops kvm_preempt_ops;
 
-static struct dentry *debugfs_dir;
+struct dentry *kvm_debugfs_dir;
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
 
+static bool kvm_rebooting;
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+                                                     int assigned_dev_id)
+{
+       struct list_head *ptr;
+       struct kvm_assigned_dev_kernel *match;
+
+       list_for_each(ptr, head) {
+               match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+               if (match->assigned_dev_id == assigned_dev_id)
+                       return match;
+       }
+       return NULL;
+}
+
+static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
+                                   *assigned_dev, int irq)
+{
+       int i, index;
+       struct msix_entry *host_msix_entries;
+
+       host_msix_entries = assigned_dev->host_msix_entries;
+
+       index = -1;
+       for (i = 0; i < assigned_dev->entries_nr; i++)
+               if (irq == host_msix_entries[i].vector) {
+                       index = i;
+                       break;
+               }
+       if (index < 0) {
+               printk(KERN_WARNING "Failed to find correlated MSI-X entry!\n");
+               return 0;
+       }
+
+       return index;
+}
+
+static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+{
+       struct kvm_assigned_dev_kernel *assigned_dev;
+       struct kvm *kvm;
+       int irq, i;
+
+       assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
+                                   interrupt_work);
+       kvm = assigned_dev->kvm;
+
+       /* The lock is taken to safely inject an irq into the guest.
+        * Once the interrupt injection (or the ioapic code) uses a
+        * finer-grained lock, update this.
+        */
+       mutex_lock(&kvm->lock);
+       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+               struct kvm_guest_msix_entry *guest_entries =
+                       assigned_dev->guest_msix_entries;
+               for (i = 0; i < assigned_dev->entries_nr; i++) {
+                       if (!(guest_entries[i].flags &
+                                       KVM_ASSIGNED_MSIX_PENDING))
+                               continue;
+                       guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
+                       kvm_set_irq(assigned_dev->kvm,
+                                   assigned_dev->irq_source_id,
+                                   guest_entries[i].vector, 1);
+                       irq = assigned_dev->host_msix_entries[i].vector;
+                       if (irq != 0)
+                               enable_irq(irq);
+                       assigned_dev->host_irq_disabled = false;
+               }
+       } else {
+               kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+                           assigned_dev->guest_irq, 1);
+               if (assigned_dev->irq_requested_type &
+                               KVM_DEV_IRQ_GUEST_MSI) {
+                       enable_irq(assigned_dev->host_irq);
+                       assigned_dev->host_irq_disabled = false;
+               }
+       }
+
+       mutex_unlock(&assigned_dev->kvm->lock);
+}
+
+static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
+{
+       struct kvm_assigned_dev_kernel *assigned_dev =
+               (struct kvm_assigned_dev_kernel *) dev_id;
+
+       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+               int index = find_index_from_host_irq(assigned_dev, irq);
+               if (index < 0)
+                       return IRQ_HANDLED;
+               assigned_dev->guest_msix_entries[index].flags |=
+                       KVM_ASSIGNED_MSIX_PENDING;
+       }
+
+       schedule_work(&assigned_dev->interrupt_work);
+
+       disable_irq_nosync(irq);
+       assigned_dev->host_irq_disabled = true;
+
+       return IRQ_HANDLED;
+}
+
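
The handler above follows the usual split-handler pattern: the hard-irq
context only records what happened, disables the line, and defers the real
work to a workqueue, where sleeping locks such as kvm->lock may be taken.
A minimal userspace model of that flow (all names here are illustrative,
not kernel API):

    #include <stdbool.h>
    #include <stdio.h>

    static bool line_disabled;  /* models the host irq line state        */
    static bool pending;        /* models KVM_ASSIGNED_MSIX_PENDING      */

    static void hard_irq(void)  /* models kvm_assigned_dev_intr()        */
    {
            pending = true;           /* record the event                */
            line_disabled = true;     /* disable_irq_nosync() equivalent */
    }

    static void work_handler(void)  /* models the interrupt work handler */
    {
            if (pending) {
                    pending = false;
                    printf("inject guest irq\n"); /* kvm_set_irq(.., 1)  */
                    line_disabled = false;        /* enable_irq()        */
            }
    }

    int main(void)
    {
            hard_irq();       /* top half: cheap, atomic context   */
            work_handler();   /* bottom half: process context      */
            return 0;
    }
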
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+       struct kvm_assigned_dev_kernel *dev;
+
+       if (kian->gsi == -1)
+               return;
+
+       dev = container_of(kian, struct kvm_assigned_dev_kernel,
+                          ack_notifier);
+
+       kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
+
+       /* The guest irq may be shared so this ack may be
+        * from another device.
+        */
+       if (dev->host_irq_disabled) {
+               enable_irq(dev->host_irq);
+               dev->host_irq_disabled = false;
+       }
+}
+
+static void deassign_guest_irq(struct kvm *kvm,
+                              struct kvm_assigned_dev_kernel *assigned_dev)
+{
+       kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
+       assigned_dev->ack_notifier.gsi = -1;
+
+       if (assigned_dev->irq_source_id != -1)
+               kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+       assigned_dev->irq_source_id = -1;
+       assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
+}
+
+/* The function implicitly holds the kvm->lock mutex due to cancel_work_sync() */
+static void deassign_host_irq(struct kvm *kvm,
+                             struct kvm_assigned_dev_kernel *assigned_dev)
+{
+       /*
+        * In kvm_free_device_irq, cancel_work_sync() returns true if:
+        * 1. the work was scheduled, and then cancelled.
+        * 2. the work callback was executed.
+        *
+        * The first case ensures that the irq is disabled and no more events
+        * will arrive. In the second case, however, the irq may have been
+        * re-enabled (e.g. for MSI), so we disable it here to prevent further
+        * events.
+        *
+        * Note that this may result in a nested disable if the interrupt type
+        * is INTx, but that is fine since we are about to free it.
+        *
+        * If this function is called as part of VM destruction, make sure the
+        * kvm state is still valid at this point, since we may also have to
+        * wait for interrupt_work to finish.
+        */
+       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+               int i;
+               for (i = 0; i < assigned_dev->entries_nr; i++)
+                       disable_irq_nosync(assigned_dev->
+                                          host_msix_entries[i].vector);
+
+               cancel_work_sync(&assigned_dev->interrupt_work);
+
+               for (i = 0; i < assigned_dev->entries_nr; i++)
+                       free_irq(assigned_dev->host_msix_entries[i].vector,
+                                (void *)assigned_dev);
+
+               assigned_dev->entries_nr = 0;
+               kfree(assigned_dev->host_msix_entries);
+               kfree(assigned_dev->guest_msix_entries);
+               pci_disable_msix(assigned_dev->dev);
+       } else {
+               /* Deal with MSI and INTx */
+               disable_irq_nosync(assigned_dev->host_irq);
+               cancel_work_sync(&assigned_dev->interrupt_work);
+
+               free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+               if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
+                       pci_disable_msi(assigned_dev->dev);
+       }
+
+       assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
+}
+
+static int kvm_deassign_irq(struct kvm *kvm,
+                           struct kvm_assigned_dev_kernel *assigned_dev,
+                           unsigned long irq_requested_type)
+{
+       unsigned long guest_irq_type, host_irq_type;
+
+       if (!irqchip_in_kernel(kvm))
+               return -EINVAL;
+       /* no irq assignment to deassign */
+       if (!assigned_dev->irq_requested_type)
+               return -ENXIO;
+
+       host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
+       guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
+
+       if (host_irq_type)
+               deassign_host_irq(kvm, assigned_dev);
+       if (guest_irq_type)
+               deassign_guest_irq(kvm, assigned_dev);
+
+       return 0;
+}
+
+static void kvm_free_assigned_irq(struct kvm *kvm,
+                                 struct kvm_assigned_dev_kernel *assigned_dev)
+{
+       kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
+}
+
+static void kvm_free_assigned_device(struct kvm *kvm,
+                                    struct kvm_assigned_dev_kernel
+                                    *assigned_dev)
+{
+       kvm_free_assigned_irq(kvm, assigned_dev);
+
+       pci_reset_function(assigned_dev->dev);
+
+       pci_release_regions(assigned_dev->dev);
+       pci_disable_device(assigned_dev->dev);
+       pci_dev_put(assigned_dev->dev);
+
+       list_del(&assigned_dev->list);
+       kfree(assigned_dev);
+}
+
+void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+       struct list_head *ptr, *ptr2;
+       struct kvm_assigned_dev_kernel *assigned_dev;
+
+       list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+               assigned_dev = list_entry(ptr,
+                                         struct kvm_assigned_dev_kernel,
+                                         list);
+
+               kvm_free_assigned_device(kvm, assigned_dev);
+       }
+}
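
list_for_each_safe() is used above because kvm_free_assigned_device()
deletes the node currently being visited; the _safe variant caches the next
pointer before the body runs. A sketch of the same loop in the more compact
list_for_each_entry_safe() form:

    struct kvm_assigned_dev_kernel *assigned_dev, *tmp;

    list_for_each_entry_safe(assigned_dev, tmp,
                             &kvm->arch.assigned_dev_head, list)
            kvm_free_assigned_device(kvm, assigned_dev);
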
+
+static int assigned_device_enable_host_intx(struct kvm *kvm,
+                                           struct kvm_assigned_dev_kernel *dev)
+{
+       dev->host_irq = dev->dev->irq;
+       /* Even though this is PCI, we don't want to use shared
+        * interrupts. Sharing host devices with guest-assigned devices
+        * on the same interrupt line is not a happy situation: there
+        * are going to be long delays in accepting, acking, etc.
+        */
+       if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
+                       0, "kvm_assigned_intx_device", (void *)dev))
+               return -EIO;
+       return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_host_msi(struct kvm *kvm,
+                                          struct kvm_assigned_dev_kernel *dev)
+{
+       int r;
+
+       if (!dev->dev->msi_enabled) {
+               r = pci_enable_msi(dev->dev);
+               if (r)
+                       return r;
+       }
+
+       dev->host_irq = dev->dev->irq;
+       if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
+                       "kvm_assigned_msi_device", (void *)dev)) {
+               pci_disable_msi(dev->dev);
+               return -EIO;
+       }
+
+       return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_host_msix(struct kvm *kvm,
+                                           struct kvm_assigned_dev_kernel *dev)
+{
+       int i, r = -EINVAL;
+
+       /* host_msix_entries and guest_msix_entries should have been
+        * initialized */
+       if (dev->entries_nr == 0)
+               return r;
+
+       r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
+       if (r)
+               return r;
+
+       for (i = 0; i < dev->entries_nr; i++) {
+               r = request_irq(dev->host_msix_entries[i].vector,
+                               kvm_assigned_dev_intr, 0,
+                               "kvm_assigned_msix_device",
+                               (void *)dev);
+               /* FIXME: free the requested irqs on failure */
+               if (r)
+                       return r;
+       }
+
+       return 0;
+}
+
+#endif
+
+static int assigned_device_enable_guest_intx(struct kvm *kvm,
+                               struct kvm_assigned_dev_kernel *dev,
+                               struct kvm_assigned_irq *irq)
+{
+       dev->guest_irq = irq->guest_irq;
+       dev->ack_notifier.gsi = irq->guest_irq;
+       return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_guest_msi(struct kvm *kvm,
+                       struct kvm_assigned_dev_kernel *dev,
+                       struct kvm_assigned_irq *irq)
+{
+       dev->guest_irq = irq->guest_irq;
+       dev->ack_notifier.gsi = -1;
+       return 0;
+}
+#endif
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_guest_msix(struct kvm *kvm,
+                       struct kvm_assigned_dev_kernel *dev,
+                       struct kvm_assigned_irq *irq)
+{
+       dev->guest_irq = irq->guest_irq;
+       dev->ack_notifier.gsi = -1;
+       return 0;
+}
+#endif
+
+static int assign_host_irq(struct kvm *kvm,
+                          struct kvm_assigned_dev_kernel *dev,
+                          __u32 host_irq_type)
+{
+       int r = -EEXIST;
+
+       if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
+               return r;
+
+       switch (host_irq_type) {
+       case KVM_DEV_IRQ_HOST_INTX:
+               r = assigned_device_enable_host_intx(kvm, dev);
+               break;
+#ifdef __KVM_HAVE_MSI
+       case KVM_DEV_IRQ_HOST_MSI:
+               r = assigned_device_enable_host_msi(kvm, dev);
+               break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+       case KVM_DEV_IRQ_HOST_MSIX:
+               r = assigned_device_enable_host_msix(kvm, dev);
+               break;
+#endif
+       default:
+               r = -EINVAL;
+       }
+
+       if (!r)
+               dev->irq_requested_type |= host_irq_type;
+
+       return r;
+}
+
+static int assign_guest_irq(struct kvm *kvm,
+                           struct kvm_assigned_dev_kernel *dev,
+                           struct kvm_assigned_irq *irq,
+                           unsigned long guest_irq_type)
+{
+       int id;
+       int r = -EEXIST;
+
+       if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
+               return r;
+
+       id = kvm_request_irq_source_id(kvm);
+       if (id < 0)
+               return id;
+
+       dev->irq_source_id = id;
+
+       switch (guest_irq_type) {
+       case KVM_DEV_IRQ_GUEST_INTX:
+               r = assigned_device_enable_guest_intx(kvm, dev, irq);
+               break;
+#ifdef __KVM_HAVE_MSI
+       case KVM_DEV_IRQ_GUEST_MSI:
+               r = assigned_device_enable_guest_msi(kvm, dev, irq);
+               break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+       case KVM_DEV_IRQ_GUEST_MSIX:
+               r = assigned_device_enable_guest_msix(kvm, dev, irq);
+               break;
+#endif
+       default:
+               r = -EINVAL;
+       }
+
+       if (!r) {
+               dev->irq_requested_type |= guest_irq_type;
+               kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+       } else
+               kvm_free_irq_source_id(kvm, dev->irq_source_id);
+
+       return r;
+}
+
+/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+                                  struct kvm_assigned_irq *assigned_irq)
+{
+       int r = -EINVAL;
+       struct kvm_assigned_dev_kernel *match;
+       unsigned long host_irq_type, guest_irq_type;
+
+       if (!capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       if (!irqchip_in_kernel(kvm))
+               return r;
+
+       mutex_lock(&kvm->lock);
+       r = -ENODEV;
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     assigned_irq->assigned_dev_id);
+       if (!match)
+               goto out;
+
+       host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
+       guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
+
+       r = -EINVAL;
+       /* can only assign one type at a time */
+       if (hweight_long(host_irq_type) > 1)
+               goto out;
+       if (hweight_long(guest_irq_type) > 1)
+               goto out;
+       if (host_irq_type == 0 && guest_irq_type == 0)
+               goto out;
+
+       r = 0;
+       if (host_irq_type)
+               r = assign_host_irq(kvm, match, host_irq_type);
+       if (r)
+               goto out;
+
+       if (guest_irq_type)
+               r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
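
The hweight_long() checks enforce "one host type and one guest type per
call": any flags word with two bits set inside the same mask is rejected.
A standalone model of that validation (the flag values are illustrative;
the real ones live in the KVM uapi headers):

    #include <stdio.h>

    #define HOST_INTX  (1UL << 0)
    #define HOST_MSI   (1UL << 1)
    #define HOST_MSIX  (1UL << 2)
    #define HOST_MASK  (HOST_INTX | HOST_MSI | HOST_MSIX)

    int main(void)
    {
            unsigned long flags = HOST_MSI | HOST_MSIX;

            /* same test as kvm_vm_ioctl_assign_irq() */
            if (__builtin_popcountl(flags & HOST_MASK) > 1)
                    printf("rejected: more than one host irq type\n");
            return 0;
    }
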
+
+static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
+                                        struct kvm_assigned_irq
+                                        *assigned_irq)
+{
+       int r = -ENODEV;
+       struct kvm_assigned_dev_kernel *match;
+
+       mutex_lock(&kvm->lock);
+
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     assigned_irq->assigned_dev_id);
+       if (!match)
+               goto out;
+
+       r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+                                     struct kvm_assigned_pci_dev *assigned_dev)
+{
+       int r = 0;
+       struct kvm_assigned_dev_kernel *match;
+       struct pci_dev *dev;
+
+       down_read(&kvm->slots_lock);
+       mutex_lock(&kvm->lock);
+
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     assigned_dev->assigned_dev_id);
+       if (match) {
+               /* device already assigned */
+               r = -EEXIST;
+               goto out;
+       }
+
+       match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+       if (match == NULL) {
+               printk(KERN_INFO "%s: Couldn't allocate memory\n",
+                      __func__);
+               r = -ENOMEM;
+               goto out;
+       }
+       dev = pci_get_bus_and_slot(assigned_dev->busnr,
+                                  assigned_dev->devfn);
+       if (!dev) {
+               printk(KERN_INFO "%s: host device not found\n", __func__);
+               r = -EINVAL;
+               goto out_free;
+       }
+       if (pci_enable_device(dev)) {
+               printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+               r = -EBUSY;
+               goto out_put;
+       }
+       r = pci_request_regions(dev, "kvm_assigned_device");
+       if (r) {
+               printk(KERN_INFO "%s: Could not get access to device regions\n",
+                      __func__);
+               goto out_disable;
+       }
+
+       pci_reset_function(dev);
+
+       match->assigned_dev_id = assigned_dev->assigned_dev_id;
+       match->host_busnr = assigned_dev->busnr;
+       match->host_devfn = assigned_dev->devfn;
+       match->flags = assigned_dev->flags;
+       match->dev = dev;
+       match->irq_source_id = -1;
+       match->kvm = kvm;
+       match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+       INIT_WORK(&match->interrupt_work,
+                 kvm_assigned_dev_interrupt_work_handler);
+
+       list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+       if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
+               if (!kvm->arch.iommu_domain) {
+                       r = kvm_iommu_map_guest(kvm);
+                       if (r)
+                               goto out_list_del;
+               }
+               r = kvm_assign_device(kvm, match);
+               if (r)
+                       goto out_list_del;
+       }
+
+out:
+       mutex_unlock(&kvm->lock);
+       up_read(&kvm->slots_lock);
+       return r;
+out_list_del:
+       list_del(&match->list);
+       pci_release_regions(dev);
+out_disable:
+       pci_disable_device(dev);
+out_put:
+       pci_dev_put(dev);
+out_free:
+       kfree(match);
+       mutex_unlock(&kvm->lock);
+       up_read(&kvm->slots_lock);
+       return r;
+}
+#endif
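
kvm_vm_ioctl_assign_device() above uses the standard kernel goto-unwind
idiom: each failure jumps to a label that releases everything acquired so
far, in reverse order of acquisition. A sketch of the shape (acquire_a and
acquire_b are placeholders, not real functions):

    r = acquire_a();        /* e.g. pci_enable_device()   */
    if (r)
            goto out;
    r = acquire_b();        /* e.g. pci_request_regions() */
    if (r)
            goto undo_a;
    return 0;

undo_a:
    release_a();
out:
    return r;
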
+
+#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
+static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
+               struct kvm_assigned_pci_dev *assigned_dev)
+{
+       int r = 0;
+       struct kvm_assigned_dev_kernel *match;
+
+       mutex_lock(&kvm->lock);
+
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     assigned_dev->assigned_dev_id);
+       if (!match) {
+               printk(KERN_INFO "%s: device hasn't been assigned before, "
+                 "so cannot be deassigned\n", __func__);
+               r = -EINVAL;
+               goto out;
+       }
+
+       if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
+               kvm_deassign_device(kvm, match);
+
+       kvm_free_assigned_device(kvm, match);
+
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+#endif
+
 static inline int valid_vcpu(int n)
 {
        return likely(n >= 0 && n < KVM_MAX_VCPUS);
 }
 
+inline int kvm_is_mmio_pfn(pfn_t pfn)
+{
+       if (pfn_valid(pfn)) {
+               struct page *page = compound_head(pfn_to_page(pfn));
+               return PageReserved(page);
+       }
+
+       return true;
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
@@ -96,27 +724,47 @@ static void ack_flush(void *_completed)
 {
 }
 
-void kvm_flush_remote_tlbs(struct kvm *kvm)
+static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
-       int i, cpu;
-       cpumask_t cpus;
+       int i, cpu, me;
+       cpumask_var_t cpus;
+       bool called = true;
        struct kvm_vcpu *vcpu;
 
-       cpus_clear(cpus);
+       if (alloc_cpumask_var(&cpus, GFP_ATOMIC))
+               cpumask_clear(cpus);
+
+       me = get_cpu();
        for (i = 0; i < KVM_MAX_VCPUS; ++i) {
                vcpu = kvm->vcpus[i];
                if (!vcpu)
                        continue;
-               if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+               if (test_and_set_bit(req, &vcpu->requests))
                        continue;
                cpu = vcpu->cpu;
-               if (cpu != -1 && cpu != raw_smp_processor_id())
-                       cpu_set(cpu, cpus);
+               if (cpus != NULL && cpu != -1 && cpu != me)
+                       cpumask_set_cpu(cpu, cpus);
        }
-       if (cpus_empty(cpus))
-               return;
-       ++kvm->stat.remote_tlb_flush;
-       smp_call_function_mask(cpus, ack_flush, NULL, 1);
+       if (unlikely(cpus == NULL))
+               smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
+       else if (!cpumask_empty(cpus))
+               smp_call_function_many(cpus, ack_flush, NULL, 1);
+       else
+               called = false;
+       put_cpu();
+       free_cpumask_var(cpus);
+       return called;
+}
+
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+       if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+               ++kvm->stat.remote_tlb_flush;
+}
+
+void kvm_reload_remote_mmus(struct kvm *kvm)
+{
+       make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
@@ -156,21 +804,184 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
 
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+       return container_of(mn, struct kvm, mmu_notifier);
+}
+
+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+                                            struct mm_struct *mm,
+                                            unsigned long address)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int need_tlb_flush;
+
+       /*
+        * When ->invalidate_page runs, the linux pte has been zapped
+        * already but the page is still allocated until
+        * ->invalidate_page returns. So if we increase the sequence
+        * here the kvm page fault will notice if the spte can't be
+        * established because the page is going to be freed. If
+        * instead the kvm page fault establishes the spte before
+        * ->invalidate_page runs, kvm_unmap_hva will release it
+        * before returning.
+        *
+        * The sequence increase only needs to be seen at spin_unlock
+        * time, and not at spin_lock time.
+        *
+        * Increasing the sequence after the spin_unlock would be
+        * unsafe because the kvm page fault could then establish the
+        * pte after kvm_unmap_hva returned, without noticing the page
+        * is going to be freed.
+        */
+       spin_lock(&kvm->mmu_lock);
+       kvm->mmu_notifier_seq++;
+       need_tlb_flush = kvm_unmap_hva(kvm, address);
+       spin_unlock(&kvm->mmu_lock);
+
+       /* we have to flush the tlb before the pages can be freed */
+       if (need_tlb_flush)
+               kvm_flush_remote_tlbs(kvm);
+
+}
+
+static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+                                                   struct mm_struct *mm,
+                                                   unsigned long start,
+                                                   unsigned long end)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int need_tlb_flush = 0;
+
+       spin_lock(&kvm->mmu_lock);
+       /*
+        * The count increase must become visible at unlock time as no
+        * spte can be established without taking the mmu_lock and
+        * count is also read inside the mmu_lock critical section.
+        */
+       kvm->mmu_notifier_count++;
+       for (; start < end; start += PAGE_SIZE)
+               need_tlb_flush |= kvm_unmap_hva(kvm, start);
+       spin_unlock(&kvm->mmu_lock);
+
+       /* we have to flush the tlb before the pages can be freed */
+       if (need_tlb_flush)
+               kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+                                                 struct mm_struct *mm,
+                                                 unsigned long start,
+                                                 unsigned long end)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+       spin_lock(&kvm->mmu_lock);
+       /*
+        * This sequence increase will notify the kvm page fault that
+        * the page that is going to be mapped in the spte could have
+        * been freed.
+        */
+       kvm->mmu_notifier_seq++;
+       /*
+        * The above sequence increase must be visible before the
+        * below count decrease, but both values are read by the kvm
+        * page fault under the mmu_lock spinlock, so we don't need
+        * to add an smp_wmb() here between the two.
+        */
+       kvm->mmu_notifier_count--;
+       spin_unlock(&kvm->mmu_lock);
+
+       BUG_ON(kvm->mmu_notifier_count < 0);
+}
+
+static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+                                             struct mm_struct *mm,
+                                             unsigned long address)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int young;
+
+       spin_lock(&kvm->mmu_lock);
+       young = kvm_age_hva(kvm, address);
+       spin_unlock(&kvm->mmu_lock);
+
+       if (young)
+               kvm_flush_remote_tlbs(kvm);
+
+       return young;
+}
+
+static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
+                                    struct mm_struct *mm)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       kvm_arch_flush_shadow(kvm);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+       .invalidate_page        = kvm_mmu_notifier_invalidate_page,
+       .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
+       .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
+       .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
+       .release                = kvm_mmu_notifier_release,
+};
+#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
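
The seq/count pair written above is consumed on the page-fault side: the
fault path samples mmu_notifier_seq before resolving the hva, then rechecks
both fields under mmu_lock before installing the spte. A sketch of that
check, modeled on the mmu_notifier_retry() helper in
include/linux/kvm_host.h (the exact form in the tree may differ):

    static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu,
                                         unsigned long mmu_seq)
    {
            if (unlikely(vcpu->kvm->mmu_notifier_count))
                    return 1;       /* an invalidate is in flight */
            smp_rmb();
            if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
                    return 1;       /* a page was invalidated meanwhile */
            return 0;               /* safe to establish the spte */
    }
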
+
 static struct kvm *kvm_create_vm(void)
 {
        struct kvm *kvm = kvm_arch_create_vm();
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+       struct page *page;
+#endif
 
        if (IS_ERR(kvm))
                goto out;
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+       INIT_LIST_HEAD(&kvm->irq_routing);
+       INIT_HLIST_HEAD(&kvm->mask_notifier_list);
+#endif
+
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       if (!page) {
+               kfree(kvm);
+               return ERR_PTR(-ENOMEM);
+       }
+       kvm->coalesced_mmio_ring =
+                       (struct kvm_coalesced_mmio_ring *)page_address(page);
+#endif
+
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+       {
+               int err;
+               kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
+               err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+               if (err) {
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+                       put_page(page);
+#endif
+                       kfree(kvm);
+                       return ERR_PTR(err);
+               }
+       }
+#endif
 
        kvm->mm = current->mm;
        atomic_inc(&kvm->mm->mm_count);
+       spin_lock_init(&kvm->mmu_lock);
        kvm_io_bus_init(&kvm->pio_bus);
        mutex_init(&kvm->lock);
        kvm_io_bus_init(&kvm->mmio_bus);
+       init_rwsem(&kvm->slots_lock);
+       atomic_set(&kvm->users_count, 1);
        spin_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
        spin_unlock(&kvm_lock);
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+       kvm_coalesced_mmio_init(kvm);
+#endif
 out:
        return kvm;
 }
@@ -187,9 +998,13 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
                vfree(free->dirty_bitmap);
 
+       if (!dont || free->lpage_info != dont->lpage_info)
+               vfree(free->lpage_info);
+
        free->npages = 0;
        free->dirty_bitmap = NULL;
        free->rmap = NULL;
+       free->lpage_info = NULL;
 }
 
 void kvm_free_physmem(struct kvm *kvm)
@@ -204,20 +1019,45 @@ static void kvm_destroy_vm(struct kvm *kvm)
 {
        struct mm_struct *mm = kvm->mm;
 
+       kvm_arch_sync_events(kvm);
        spin_lock(&kvm_lock);
        list_del(&kvm->vm_list);
        spin_unlock(&kvm_lock);
+       kvm_free_irq_routing(kvm);
        kvm_io_bus_destroy(&kvm->pio_bus);
        kvm_io_bus_destroy(&kvm->mmio_bus);
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+       if (kvm->coalesced_mmio_ring != NULL)
+               free_page((unsigned long)kvm->coalesced_mmio_ring);
+#endif
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+       mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+#else
+       kvm_arch_flush_shadow(kvm);
+#endif
        kvm_arch_destroy_vm(kvm);
        mmdrop(mm);
 }
 
+void kvm_get_kvm(struct kvm *kvm)
+{
+       atomic_inc(&kvm->users_count);
+}
+EXPORT_SYMBOL_GPL(kvm_get_kvm);
+
+void kvm_put_kvm(struct kvm *kvm)
+{
+       if (atomic_dec_and_test(&kvm->users_count))
+               kvm_destroy_vm(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_put_kvm);
+
+
 static int kvm_vm_release(struct inode *inode, struct file *filp)
 {
        struct kvm *kvm = filp->private_data;
 
-       kvm_destroy_vm(kvm);
+       kvm_put_kvm(kvm);
        return 0;
 }
 
@@ -236,6 +1076,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
        int r;
        gfn_t base_gfn;
        unsigned long npages;
+       int largepages;
        unsigned long i;
        struct kvm_memory_slot *memslot;
        struct kvm_memory_slot old, new;
@@ -246,6 +1087,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
                goto out;
        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
                goto out;
+       if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
+               goto out;
        if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
                goto out;
        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
@@ -274,7 +1117,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
        for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
                struct kvm_memory_slot *s = &kvm->memslots[i];
 
-               if (s == memslot)
+               if (s == memslot || !s->npages)
                        continue;
                if (!((base_gfn + npages <= s->base_gfn) ||
                      (base_gfn >= s->base_gfn + s->npages)))
@@ -288,6 +1131,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
        r = -ENOMEM;
 
        /* Allocate if a slot is being created */
+#ifndef CONFIG_S390
        if (npages && !new.rmap) {
                new.rmap = vmalloc(npages * sizeof(struct page *));
 
@@ -297,7 +1141,31 @@ int __kvm_set_memory_region(struct kvm *kvm,
                memset(new.rmap, 0, npages * sizeof(*new.rmap));
 
                new.user_alloc = user_alloc;
-               new.userspace_addr = mem->userspace_addr;
+               /*
+                * hva_to_rmmap() serializes with the mmu_lock and, to be
+                * safe, it has to ignore memslots with !user_alloc &&
+                * !userspace_addr.
+                */
+               if (user_alloc)
+                       new.userspace_addr = mem->userspace_addr;
+               else
+                       new.userspace_addr = 0;
+       }
+       if (npages && !new.lpage_info) {
+               largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE;
+               largepages -= base_gfn / KVM_PAGES_PER_HPAGE;
+
+               new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));
+
+               if (!new.lpage_info)
+                       goto out_free;
+
+               memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));
+
+               if (base_gfn % KVM_PAGES_PER_HPAGE)
+                       new.lpage_info[0].write_count = 1;
+               if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE)
+                       new.lpage_info[largepages-1].write_count = 1;
        }
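
The largepages arithmetic counts how many huge-page frames the slot
touches, including the partially covered ones at either end, and marks
those partial frames unusable for large sptes by setting write_count. A
worked example, assuming KVM_PAGES_PER_HPAGE == 512 (the x86 2M/4K ratio):

    #include <stdio.h>

    #define KVM_PAGES_PER_HPAGE 512UL   /* assumption: x86 2M pages */

    int main(void)
    {
            unsigned long base_gfn = 700, npages = 1000; /* gfns 700..1699 */
            unsigned long largepages;

            /* same formula as __kvm_set_memory_region() */
            largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE;
            largepages -= base_gfn / KVM_PAGES_PER_HPAGE;

            /* hpages 1, 2 and 3 are touched, so this prints 3 */
            printf("largepages = %lu\n", largepages);

            /* both ends are unaligned, so lpage_info[0] and
             * lpage_info[largepages - 1] get write_count = 1 */
            printf("first partial: %d, last partial: %d\n",
                   base_gfn % KVM_PAGES_PER_HPAGE != 0,
                   (base_gfn + npages) % KVM_PAGES_PER_HPAGE != 0);
            return 0;
    }
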
 
        /* Allocate page dirty bitmap if needed */
@@ -309,19 +1177,36 @@ int __kvm_set_memory_region(struct kvm *kvm,
                        goto out_free;
                memset(new.dirty_bitmap, 0, dirty_bytes);
        }
+#endif /* not defined CONFIG_S390 */
 
+       if (!npages)
+               kvm_arch_flush_shadow(kvm);
+
+       spin_lock(&kvm->mmu_lock);
        if (mem->slot >= kvm->nmemslots)
                kvm->nmemslots = mem->slot + 1;
 
        *memslot = new;
+       spin_unlock(&kvm->mmu_lock);
 
        r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
        if (r) {
+               spin_lock(&kvm->mmu_lock);
                *memslot = old;
+               spin_unlock(&kvm->mmu_lock);
                goto out_free;
        }
 
-       kvm_free_physmem_slot(&old, &new);
+       kvm_free_physmem_slot(&old, npages ? &new : NULL);
+       /* Slot deletion case: we have to update the current slot */
+       if (!npages)
+               *memslot = old;
+#ifdef CONFIG_DMAR
+       /* map the pages in iommu page table */
+       r = kvm_iommu_map_pages(kvm, base_gfn, npages);
+       if (r)
+               goto out;
+#endif
        return 0;
 
 out_free:
@@ -338,9 +1223,9 @@ int kvm_set_memory_region(struct kvm *kvm,
 {
        int r;
 
-       down_write(&current->mm->mmap_sem);
+       down_write(&kvm->slots_lock);
        r = __kvm_set_memory_region(kvm, mem, user_alloc);
-       up_write(&current->mm->mmap_sem);
+       up_write(&kvm->slots_lock);
        return r;
 }
 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
@@ -395,6 +1280,12 @@ int is_error_page(struct page *page)
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
+int is_error_pfn(pfn_t pfn)
+{
+       return pfn == bad_pfn;
+}
+EXPORT_SYMBOL_GPL(is_error_pfn);
+
 static inline unsigned long bad_hva(void)
 {
        return PAGE_OFFSET;
@@ -406,7 +1297,7 @@ int kvm_is_error_hva(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 {
        int i;
 
@@ -419,11 +1310,12 @@ static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
        }
        return NULL;
 }
+EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);
 
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
        gfn = unalias_gfn(kvm, gfn);
-       return __gfn_to_memslot(kvm, gfn);
+       return gfn_to_memslot_unaliased(kvm, gfn);
 }
 
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
@@ -442,61 +1334,131 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
-static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
        struct kvm_memory_slot *slot;
 
        gfn = unalias_gfn(kvm, gfn);
-       slot = __gfn_to_memslot(kvm, gfn);
+       slot = gfn_to_memslot_unaliased(kvm, gfn);
        if (!slot)
                return bad_hva();
        return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 }
+EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-/*
- * Requires current->mm->mmap_sem to be held
- */
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
        struct page *page[1];
        unsigned long addr;
        int npages;
+       pfn_t pfn;
 
        might_sleep();
 
        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr)) {
                get_page(bad_page);
-               return bad_page;
+               return page_to_pfn(bad_page);
        }
 
-       npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
-                               NULL);
+       npages = get_user_pages_fast(addr, 1, 1, page);
 
-       if (npages != 1) {
-               get_page(bad_page);
-               return bad_page;
-       }
+       if (unlikely(npages != 1)) {
+               struct vm_area_struct *vma;
+
+               down_read(&current->mm->mmap_sem);
+               vma = find_vma(current->mm, addr);
 
-       return page[0];
+               if (vma == NULL || addr < vma->vm_start ||
+                   !(vma->vm_flags & VM_PFNMAP)) {
+                       up_read(&current->mm->mmap_sem);
+                       get_page(bad_page);
+                       return page_to_pfn(bad_page);
+               }
+
+               pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+               up_read(&current->mm->mmap_sem);
+               BUG_ON(!kvm_is_mmio_pfn(pfn));
+       } else
+               pfn = page_to_pfn(page[0]);
+
+       return pfn;
+}
+
+EXPORT_SYMBOL_GPL(gfn_to_pfn);
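
For VM_PFNMAP vmas (device memory mapped with remap_pfn_range()),
get_user_pages_fast() fails because no struct page backs the mapping, so
the pfn is computed from the vma itself: vm_pgoff holds the first pfn of
the mapping, and the page offset of the address within the vma supplies
the rest. A worked example with illustrative numbers:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
            unsigned long vm_start = 0x7f0000000000UL; /* vma->vm_start */
            unsigned long vm_pgoff = 0x80000UL;        /* first pfn     */
            unsigned long addr     = 0x7f0000003000UL; /* faulting hva  */

            /* same formula as gfn_to_pfn() above */
            unsigned long pfn = ((addr - vm_start) >> PAGE_SHIFT) + vm_pgoff;

            printf("pfn = 0x%lx\n", pfn);   /* 0x80003: base pfn + 3 */
            return 0;
    }
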
+
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+{
+       pfn_t pfn;
+
+       pfn = gfn_to_pfn(kvm, gfn);
+       if (!kvm_is_mmio_pfn(pfn))
+               return pfn_to_page(pfn);
+
+       WARN_ON(kvm_is_mmio_pfn(pfn));
+
+       get_page(bad_page);
+       return bad_page;
 }
 
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
 void kvm_release_page_clean(struct page *page)
 {
-       put_page(page);
+       kvm_release_pfn_clean(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
+void kvm_release_pfn_clean(pfn_t pfn)
+{
+       if (!kvm_is_mmio_pfn(pfn))
+               put_page(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
+
 void kvm_release_page_dirty(struct page *page)
 {
-       if (!PageReserved(page))
-               SetPageDirty(page);
-       put_page(page);
+       kvm_release_pfn_dirty(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
 
+void kvm_release_pfn_dirty(pfn_t pfn)
+{
+       kvm_set_pfn_dirty(pfn);
+       kvm_release_pfn_clean(pfn);
+}
+EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
+
+void kvm_set_page_dirty(struct page *page)
+{
+       kvm_set_pfn_dirty(page_to_pfn(page));
+}
+EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
+
+void kvm_set_pfn_dirty(pfn_t pfn)
+{
+       if (!kvm_is_mmio_pfn(pfn)) {
+               struct page *page = pfn_to_page(pfn);
+               if (!PageReserved(page))
+                       SetPageDirty(page);
+       }
+}
+EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
+
+void kvm_set_pfn_accessed(pfn_t pfn)
+{
+       if (!kvm_is_mmio_pfn(pfn))
+               mark_page_accessed(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
+
+void kvm_get_pfn(pfn_t pfn)
+{
+       if (!kvm_is_mmio_pfn(pfn))
+               get_page(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_get_pfn);
+
 static int next_segment(unsigned long len, int offset)
 {
        if (len > PAGE_SIZE - offset)
@@ -541,6 +1503,26 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest);
 
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+                         unsigned long len)
+{
+       int r;
+       unsigned long addr;
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       int offset = offset_in_page(gpa);
+
+       addr = gfn_to_hva(kvm, gfn);
+       if (kvm_is_error_hva(addr))
+               return -EFAULT;
+       pagefault_disable();
+       r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+       pagefault_enable();
+       if (r)
+               return -EFAULT;
+       return 0;
+}
+EXPORT_SYMBOL(kvm_read_guest_atomic);
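
Unlike kvm_read_guest(), this variant never sleeps: pagefault_disable()
makes a faulting access inside __copy_from_user_inatomic() return an error
instead of blocking on a page fault, so the helper is usable under a
spinlock. A hypothetical caller reading a guest pte while holding mmu_lock
(gpte_gpa is an assumed variable, for illustration):

    u64 gpte;

    spin_lock(&kvm->mmu_lock);
    if (kvm_read_guest_atomic(kvm, gpte_gpa, &gpte, sizeof(gpte)))
            gpte = 0;       /* page not resident; treat the pte as clear */
    spin_unlock(&kvm->mmu_lock);
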
+
 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
                         int offset, int len)
 {
@@ -608,7 +1590,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
        struct kvm_memory_slot *memslot;
 
        gfn = unalias_gfn(kvm, gfn);
-       memslot = __gfn_to_memslot(kvm, gfn);
+       memslot = gfn_to_memslot_unaliased(kvm, gfn);
        if (memslot && memslot->dirty_bitmap) {
                unsigned long rel_gfn = gfn - memslot->base_gfn;
 
@@ -623,24 +1605,26 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
  */
 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 {
-       DECLARE_WAITQUEUE(wait, current);
+       DEFINE_WAIT(wait);
 
-       add_wait_queue(&vcpu->wq, &wait);
+       for (;;) {
+               prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+
+               if (kvm_cpu_has_interrupt(vcpu) ||
+                   kvm_cpu_has_pending_timer(vcpu) ||
+                   kvm_arch_vcpu_runnable(vcpu)) {
+                       set_bit(KVM_REQ_UNHALT, &vcpu->requests);
+                       break;
+               }
+               if (signal_pending(current))
+                       break;
 
-       /*
-        * We will block until either an interrupt or a signal wakes us up
-        */
-       while (!kvm_cpu_has_interrupt(vcpu)
-              && !signal_pending(current)
-              && !kvm_arch_vcpu_runnable(vcpu)) {
-               set_current_state(TASK_INTERRUPTIBLE);
                vcpu_put(vcpu);
                schedule();
                vcpu_load(vcpu);
        }
 
-       __set_current_state(TASK_RUNNING);
-       remove_wait_queue(&vcpu->wq, &wait);
+       finish_wait(&vcpu->wq, &wait);
 }
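
The rewrite replaces the open-coded wait with the canonical
prepare_to_wait() loop, which closes the race where the wake-up condition
becomes true between testing it and going to sleep: prepare_to_wait()
queues the task and sets its state before the condition is tested, so a
wake-up arriving in between simply leaves the task runnable and schedule()
returns at once. The general shape of the idiom:

    DEFINE_WAIT(wait);

    for (;;) {
            prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);
            if (condition)                  /* tested after queueing */
                    break;
            if (signal_pending(current))
                    break;
            schedule();
    }
    finish_wait(&wq, &wait);
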
 
 void kvm_resched(struct kvm_vcpu *vcpu)
@@ -658,8 +1642,14 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
        if (vmf->pgoff == 0)
                page = virt_to_page(vcpu->run);
+#ifdef CONFIG_X86
        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->arch.pio_data);
+#endif
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+       else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
+               page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
+#endif
        else
                return VM_FAULT_SIGBUS;
        get_page(page);
@@ -681,7 +1671,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 {
        struct kvm_vcpu *vcpu = filp->private_data;
 
-       fput(vcpu->kvm->filp);
+       kvm_put_kvm(vcpu->kvm);
        return 0;
 }
 
@@ -697,15 +1687,9 @@ static struct file_operations kvm_vcpu_fops = {
  */
 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 {
-       int fd, r;
-       struct inode *inode;
-       struct file *file;
-
-       r = anon_inode_getfd(&fd, &inode, &file,
-                            "kvm-vcpu", &kvm_vcpu_fops, vcpu);
-       if (r)
-               return r;
-       atomic_inc(&vcpu->kvm->filp->f_count);
+       int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0);
+       if (fd < 0)
+               kvm_put_kvm(vcpu->kvm);
        return fd;
 }
 
@@ -728,18 +1712,18 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 
        r = kvm_arch_vcpu_setup(vcpu);
        if (r)
-               goto vcpu_destroy;
+               return r;
 
        mutex_lock(&kvm->lock);
        if (kvm->vcpus[n]) {
                r = -EEXIST;
-               mutex_unlock(&kvm->lock);
                goto vcpu_destroy;
        }
        kvm->vcpus[n] = vcpu;
        mutex_unlock(&kvm->lock);
 
        /* Now it's all set up, let userspace reach it */
+       kvm_get_kvm(kvm);
        r = create_vcpu_fd(vcpu);
        if (r < 0)
                goto unlink;
@@ -748,8 +1732,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 unlink:
        mutex_lock(&kvm->lock);
        kvm->vcpus[n] = NULL;
-       mutex_unlock(&kvm->lock);
 vcpu_destroy:
+       mutex_unlock(&kvm->lock);
        kvm_arch_vcpu_destroy(vcpu);
        return r;
 }
@@ -765,12 +1749,96 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
        return 0;
 }
 
+#ifdef __KVM_HAVE_MSIX
+static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
+                                   struct kvm_assigned_msix_nr *entry_nr)
+{
+       int r = 0;
+       struct kvm_assigned_dev_kernel *adev;
+
+       mutex_lock(&kvm->lock);
+
+       adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     entry_nr->assigned_dev_id);
+       if (!adev) {
+               r = -EINVAL;
+               goto msix_nr_out;
+       }
+
+       if (adev->entries_nr == 0) {
+               adev->entries_nr = entry_nr->entry_nr;
+               if (adev->entries_nr == 0 ||
+                   adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
+                       r = -EINVAL;
+                       goto msix_nr_out;
+               }
+
+               adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
+                                               entry_nr->entry_nr,
+                                               GFP_KERNEL);
+               if (!adev->host_msix_entries) {
+                       r = -ENOMEM;
+                       goto msix_nr_out;
+               }
+               adev->guest_msix_entries = kzalloc(
+                               sizeof(struct kvm_guest_msix_entry) *
+                               entry_nr->entry_nr, GFP_KERNEL);
+               if (!adev->guest_msix_entries) {
+                       kfree(adev->host_msix_entries);
+                       r = -ENOMEM;
+                       goto msix_nr_out;
+               }
+       } else /* Not allowed to set the MSI-X number twice */
+               r = -EINVAL;
+msix_nr_out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
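
The kzalloc(sizeof(x) * n, ...) calls above are bounded by the
KVM_MAX_MSIX_PER_DEV check, so the multiplication cannot overflow here; as
a general pattern, though, such array allocations are written more
defensively with kcalloc(), which performs the overflow check itself:

    adev->host_msix_entries = kcalloc(entry_nr->entry_nr,
                                      sizeof(struct msix_entry), GFP_KERNEL);
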
+
+static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
+                                      struct kvm_assigned_msix_entry *entry)
+{
+       int r = 0, i;
+       struct kvm_assigned_dev_kernel *adev;
+
+       mutex_lock(&kvm->lock);
+
+       adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     entry->assigned_dev_id);
+
+       if (!adev) {
+               r = -EINVAL;
+               goto msix_entry_out;
+       }
+
+       for (i = 0; i < adev->entries_nr; i++)
+               if (adev->guest_msix_entries[i].vector == 0 ||
+                   adev->guest_msix_entries[i].entry == entry->entry) {
+                       adev->guest_msix_entries[i].entry = entry->entry;
+                       adev->guest_msix_entries[i].vector = entry->gsi;
+                       adev->host_msix_entries[i].entry = entry->entry;
+                       break;
+               }
+       if (i == adev->entries_nr) {
+               r = -ENOSPC;
+               goto msix_entry_out;
+       }
+
+msix_entry_out:
+       mutex_unlock(&kvm->lock);
+
+       return r;
+}
+#endif
+
 static long kvm_vcpu_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
 {
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;
+       struct kvm_fpu *fpu = NULL;
+       struct kvm_sregs *kvm_sregs = NULL;
 
        if (vcpu->kvm->mm != current->mm)
                return -EIO;
@@ -782,50 +1850,88 @@ static long kvm_vcpu_ioctl(struct file *filp,
                r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
                break;
        case KVM_GET_REGS: {
-               struct kvm_regs kvm_regs;
+               struct kvm_regs *kvm_regs;
 
-               memset(&kvm_regs, 0, sizeof kvm_regs);
-               r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
-               if (r)
+               r = -ENOMEM;
+               kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
+               if (!kvm_regs)
                        goto out;
+               r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
+               if (r)
+                       goto out_free1;
                r = -EFAULT;
-               if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
-                       goto out;
+               if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
+                       goto out_free1;
                r = 0;
+out_free1:
+               kfree(kvm_regs);
                break;
        }
        case KVM_SET_REGS: {
-               struct kvm_regs kvm_regs;
+               struct kvm_regs *kvm_regs;
 
+               r = -ENOMEM;
+               kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
+               if (!kvm_regs)
+                       goto out;
+               r = -EFAULT;
+               if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
+                       goto out_free2;
+               r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
+               if (r)
+                       goto out_free2;
+               r = 0;
+out_free2:
+               kfree(kvm_regs);
+               break;
+       }
+       case KVM_GET_SREGS: {
+               kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!kvm_sregs)
+                       goto out;
+               r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_SREGS: {
+               kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!kvm_sregs)
+                       goto out;
                r = -EFAULT;
-               if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
+               if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
                        goto out;
-               r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
+               r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
                if (r)
                        goto out;
                r = 0;
                break;
        }
-       case KVM_GET_SREGS: {
-               struct kvm_sregs kvm_sregs;
+       case KVM_GET_MP_STATE: {
+               struct kvm_mp_state mp_state;
 
-               memset(&kvm_sregs, 0, sizeof kvm_sregs);
-               r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
+               r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
                if (r)
                        goto out;
                r = -EFAULT;
-               if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
+               if (copy_to_user(argp, &mp_state, sizeof mp_state))
                        goto out;
                r = 0;
                break;
        }
-       case KVM_SET_SREGS: {
-               struct kvm_sregs kvm_sregs;
+       case KVM_SET_MP_STATE: {
+               struct kvm_mp_state mp_state;
 
                r = -EFAULT;
-               if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
+               if (copy_from_user(&mp_state, argp, sizeof mp_state))
                        goto out;
-               r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
+               r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
                if (r)
                        goto out;
                r = 0;
@@ -846,13 +1952,13 @@ static long kvm_vcpu_ioctl(struct file *filp,
                r = 0;
                break;
        }
-       case KVM_DEBUG_GUEST: {
-               struct kvm_debug_guest dbg;
+       case KVM_SET_GUEST_DEBUG: {
+               struct kvm_guest_debug dbg;
 
                r = -EFAULT;
                if (copy_from_user(&dbg, argp, sizeof dbg))
                        goto out;
-               r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
+               r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
                if (r)
                        goto out;
                r = 0;
@@ -882,25 +1988,28 @@ static long kvm_vcpu_ioctl(struct file *filp,
                break;
        }
        case KVM_GET_FPU: {
-               struct kvm_fpu fpu;
-
-               memset(&fpu, 0, sizeof fpu);
-               r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
+               fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!fpu)
+                       goto out;
+               r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
                if (r)
                        goto out;
                r = -EFAULT;
-               if (copy_to_user(argp, &fpu, sizeof fpu))
+               if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_FPU: {
-               struct kvm_fpu fpu;
-
+               fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!fpu)
+                       goto out;
                r = -EFAULT;
-               if (copy_from_user(&fpu, argp, sizeof fpu))
+               if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
                        goto out;
-               r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
+               r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
                if (r)
                        goto out;
                r = 0;
@@ -910,6 +2019,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
                r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
        }
 out:
+       kfree(fpu);
+       kfree(kvm_sregs);
        return r;
 }
 
@@ -952,6 +2063,138 @@ static long kvm_vm_ioctl(struct file *filp,
                        goto out;
                break;
        }
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+       case KVM_REGISTER_COALESCED_MMIO: {
+               struct kvm_coalesced_mmio_zone zone;
+               r = -EFAULT;
+               if (copy_from_user(&zone, argp, sizeof zone))
+                       goto out;
+               r = -ENXIO;
+               r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_UNREGISTER_COALESCED_MMIO: {
+               struct kvm_coalesced_mmio_zone zone;
+               r = -EFAULT;
+               if (copy_from_user(&zone, argp, sizeof zone))
+                       goto out;
+               r = -ENXIO;
+               r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+#endif
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+       case KVM_ASSIGN_PCI_DEVICE: {
+               struct kvm_assigned_pci_dev assigned_dev;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+                       goto out;
+               r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_ASSIGN_IRQ: {
+               r = -EOPNOTSUPP;
+               break;
+       }
+#ifdef KVM_CAP_ASSIGN_DEV_IRQ
+       case KVM_ASSIGN_DEV_IRQ: {
+               struct kvm_assigned_irq assigned_irq;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+                       goto out;
+               r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_DEASSIGN_DEV_IRQ: {
+               struct kvm_assigned_irq assigned_irq;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+                       goto out;
+               r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
+               if (r)
+                       goto out;
+               break;
+       }
+#endif
+#endif
+#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
+       case KVM_DEASSIGN_PCI_DEVICE: {
+               struct kvm_assigned_pci_dev assigned_dev;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+                       goto out;
+               r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
+               if (r)
+                       goto out;
+               break;
+       }
+#endif
+#ifdef KVM_CAP_IRQ_ROUTING
+       case KVM_SET_GSI_ROUTING: {
+               struct kvm_irq_routing routing;
+               struct kvm_irq_routing __user *urouting;
+               struct kvm_irq_routing_entry *entries;
+
+               r = -EFAULT;
+               if (copy_from_user(&routing, argp, sizeof(routing)))
+                       goto out;
+               r = -EINVAL;
+               if (routing.nr >= KVM_MAX_IRQ_ROUTES)
+                       goto out;
+               if (routing.flags)
+                       goto out;
+               r = -ENOMEM;
+               entries = vmalloc(routing.nr * sizeof(*entries));
+               if (!entries)
+                       goto out;
+               r = -EFAULT;
+               urouting = argp;
+               if (copy_from_user(entries, urouting->entries,
+                                  routing.nr * sizeof(*entries)))
+                       goto out_free_irq_routing;
+               r = kvm_set_irq_routing(kvm, entries, routing.nr,
+                                       routing.flags);
+       out_free_irq_routing:
+               vfree(entries);
+               break;
+       }
+#ifdef __KVM_HAVE_MSIX
+       case KVM_ASSIGN_SET_MSIX_NR: {
+               struct kvm_assigned_msix_nr entry_nr;
+               r = -EFAULT;
+               if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
+                       goto out;
+               r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_ASSIGN_SET_MSIX_ENTRY: {
+               struct kvm_assigned_msix_entry entry;
+               r = -EFAULT;
+               if (copy_from_user(&entry, argp, sizeof entry))
+                       goto out;
+               r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
+               if (r)
+                       goto out;
+               break;
+       }
+#endif
+#endif /* KVM_CAP_IRQ_ROUTING */
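
struct kvm_irq_routing carries its entries in a trailing flexible array, which is why the handler above copies the fixed header first and the entries separately. A hedged userspace sketch of building a one-entry table (field names as in <linux/kvm.h>; routes GSI 0 to pin 0 of the first in-kernel irqchip):

    #include <linux/kvm.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>

    static int set_one_route(int vm_fd)
    {
            struct kvm_irq_routing *table;
            int r;

            table = calloc(1, sizeof(*table) + sizeof(table->entries[0]));
            if (!table)
                    return -1;

            table->nr = 1;          /* must stay below KVM_MAX_IRQ_ROUTES */
            table->entries[0].gsi = 0;
            table->entries[0].type = KVM_IRQ_ROUTING_IRQCHIP;
            table->entries[0].u.irqchip.irqchip = 0;
            table->entries[0].u.irqchip.pin = 0;

            r = ioctl(vm_fd, KVM_SET_GSI_ROUTING, table);
            free(table);
            return r;
    }
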
        default:
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
        }
@@ -961,17 +2204,22 @@ out:
 
 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+       struct page *page[1];
+       unsigned long addr;
+       int npages;
+       gfn_t gfn = vmf->pgoff;
        struct kvm *kvm = vma->vm_file->private_data;
-       struct page *page;
 
-       if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
+       addr = gfn_to_hva(kvm, gfn);
+       if (kvm_is_error_hva(addr))
                return VM_FAULT_SIGBUS;
-       page = gfn_to_page(kvm, vmf->pgoff);
-       if (is_error_page(page)) {
-               kvm_release_page_clean(page);
+
+       npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
+                               NULL);
+       if (unlikely(npages != 1))
                return VM_FAULT_SIGBUS;
-       }
-       vmf->page = page;
+
+       vmf->page = page[0];
        return 0;
 }
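
The fault handler now resolves the gfn to a host virtual address and pins the page with get_user_pages() itself instead of calling gfn_to_page(). Conceptually, gfn_to_hva() is offset arithmetic within the matching memslot; a simplified sketch (the real helper, defined elsewhere in this file, also performs the slot lookup and returns a poisoned address for unmapped gfns):

    /* illustration only; see gfn_to_hva() in this file for the real thing */
    static unsigned long sketch_gfn_to_hva(struct kvm_memory_slot *slot,
                                           gfn_t gfn)
    {
            return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
    }
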
 
@@ -994,29 +2242,39 @@ static struct file_operations kvm_vm_fops = {
 
 static int kvm_dev_ioctl_create_vm(void)
 {
-       int fd, r;
-       struct inode *inode;
-       struct file *file;
+       int fd;
        struct kvm *kvm;
 
        kvm = kvm_create_vm();
        if (IS_ERR(kvm))
                return PTR_ERR(kvm);
-       r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
-       if (r) {
-               kvm_destroy_vm(kvm);
-               return r;
-       }
-
-       kvm->filp = file;
+       fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0);
+       if (fd < 0)
+               kvm_put_kvm(kvm);
 
        return fd;
 }
 
+static long kvm_dev_ioctl_check_extension_generic(long arg)
+{
+       switch (arg) {
+       case KVM_CAP_USER_MEMORY:
+       case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+       case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
+               return 1;
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+       case KVM_CAP_IRQ_ROUTING:
+               return KVM_MAX_IRQ_ROUTES;
+#endif
+       default:
+               break;
+       }
+       return kvm_dev_ioctl_check_extension(arg);
+}
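
Note that KVM_CHECK_EXTENSION results are not strictly boolean: KVM_CAP_IRQ_ROUTING reports the routing-table limit rather than 1, so callers should treat any positive value as "supported". A rough userspace probe (kvm_fd is assumed to be an open handle on /dev/kvm):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int have_cap(int kvm_fd, long cap)
    {
            int r = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap);

            return r > 0;   /* zero or negative means absent or unsupported */
    }
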
+
 static long kvm_dev_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
 {
-       void __user *argp = (void __user *)arg;
        long r = -EINVAL;
 
        switch (ioctl) {
@@ -1033,13 +2291,24 @@ static long kvm_dev_ioctl(struct file *filp,
                r = kvm_dev_ioctl_create_vm();
                break;
        case KVM_CHECK_EXTENSION:
-               r = kvm_dev_ioctl_check_extension((long)argp);
+               r = kvm_dev_ioctl_check_extension_generic(arg);
                break;
        case KVM_GET_VCPU_MMAP_SIZE:
                r = -EINVAL;
                if (arg)
                        goto out;
-               r = 2 * PAGE_SIZE;
+               r = PAGE_SIZE;     /* struct kvm_run */
+#ifdef CONFIG_X86
+               r += PAGE_SIZE;    /* pio data page */
+#endif
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+               r += PAGE_SIZE;    /* coalesced mmio ring page */
+#endif
+               break;
+       case KVM_TRACE_ENABLE:
+       case KVM_TRACE_PAUSE:
+       case KVM_TRACE_DISABLE:
+               r = kvm_trace_ioctl(ioctl, arg);
                break;
        default:
                return kvm_arch_dev_ioctl(filp, ioctl, arg);
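
Since the mmap size is now configuration-dependent rather than a hard-coded two pages, userspace must query it before mapping the vcpu fd. Roughly (kvm_fd is /dev/kvm, vcpu_fd is assumed to come from KVM_CREATE_VCPU):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>

    static struct kvm_run *map_vcpu_run(int kvm_fd, int vcpu_fd)
    {
            long size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
            void *p;

            if (size < 0)
                    return NULL;
            p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                     vcpu_fd, 0);
            return p == MAP_FAILED ? NULL : p;
    }
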
@@ -1063,9 +2332,9 @@ static void hardware_enable(void *junk)
 {
        int cpu = raw_smp_processor_id();
 
-       if (cpu_isset(cpu, cpus_hardware_enabled))
+       if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
                return;
-       cpu_set(cpu, cpus_hardware_enabled);
+       cpumask_set_cpu(cpu, cpus_hardware_enabled);
        kvm_arch_hardware_enable(NULL);
 }
 
@@ -1073,10 +2342,9 @@ static void hardware_disable(void *junk)
 {
        int cpu = raw_smp_processor_id();
 
-       if (!cpu_isset(cpu, cpus_hardware_enabled))
+       if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
                return;
-       cpu_clear(cpu, cpus_hardware_enabled);
-       decache_vcpus_on_cpu(cpu);
+       cpumask_clear_cpu(cpu, cpus_hardware_enabled);
        kvm_arch_hardware_disable(NULL);
 }
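
cpumask_var_t is the stack-safe replacement for cpumask_t used throughout this patch: with CONFIG_CPUMASK_OFFSTACK it is a pointer that must be allocated and freed explicitly, otherwise it degenerates to an ordinary array. The usual lifecycle, sketched:

    #include <linux/cpumask.h>
    #include <linux/slab.h>

    static int sketch_cpumask_use(void)
    {
            cpumask_var_t mask;

            if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) /* zeroed on success */
                    return -ENOMEM;

            cpumask_set_cpu(0, mask);
            /* ... use mask ... */

            free_cpumask_var(mask); /* no-op in the on-stack configuration */
            return 0;
    }
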
 
@@ -1095,17 +2363,29 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
        case CPU_UP_CANCELED:
                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
                       cpu);
-               smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
+               smp_call_function_single(cpu, hardware_disable, NULL, 1);
                break;
        case CPU_ONLINE:
                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
                       cpu);
-               smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
+               smp_call_function_single(cpu, hardware_enable, NULL, 1);
                break;
        }
        return NOTIFY_OK;
 }
 
+
+asmlinkage void kvm_handle_fault_on_reboot(void)
+{
+       if (kvm_rebooting)
+               /* spin while reset goes on */
+               while (true)
+                       cpu_relax();
+       /* Fault while not rebooting.  We want the trace. */
+       BUG();
+}
+EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
+
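kvm_handle_fault_on_reboot() is the landing pad for hardware-virtualization instructions that fault because the reboot notifier has already disabled VMX/SVM on every cpu. On x86 the call sites reach it through an exception-table fixup; a rough sketch of the idea, assuming the shape of the real __kvm_handle_fault_on_reboot macro in the arch headers (which differs in detail, e.g. it also records the faulting address):

    /* sketch: wrap a faultable virtualization instruction with a fixup */
    #define sketch_fault_on_reboot(insn)                    \
            "666: " insn "\n\t"                             \
            ".pushsection .fixup, \"ax\"\n"                 \
            "667: jmp kvm_handle_fault_on_reboot\n\t"       \
            ".popsection\n\t"                               \
            ".pushsection __ex_table, \"a\"\n\t"            \
            ".quad 666b, 667b\n\t"  /* .long on 32-bit */   \
            ".popsection"
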
 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
                      void *v)
 {
@@ -1115,7 +2395,8 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
                 * in vmx root mode.
                 */
                printk(KERN_INFO "kvm: exiting hardware virtualization\n");
-               on_each_cpu(hardware_disable, NULL, 0, 1);
+               kvm_rebooting = true;
+               on_each_cpu(hardware_disable, NULL, 1);
        }
        return NOTIFY_OK;
 }
@@ -1141,14 +2422,15 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus)
        }
 }
 
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
+                                         gpa_t addr, int len, int is_write)
 {
        int i;
 
        for (i = 0; i < bus->dev_count; i++) {
                struct kvm_io_device *pos = bus->devs[i];
 
-               if (pos->in_range(pos, addr))
+               if (pos->in_range(pos, addr, len, is_write))
                        return pos;
        }
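
in_range() callbacks now see the access length and direction, so a device can decline, say, an access that straddles its window instead of claiming anything that touches its base address. A hypothetical device covering a fixed 4K window under the new signature:

    #include "iodev.h"      /* kvm-internal header, as included by this file */

    #define SKETCH_MMIO_BASE 0xfee01000ULL  /* hypothetical window */
    #define SKETCH_MMIO_SIZE 0x1000

    static int sketch_in_range(struct kvm_io_device *dev, gpa_t addr,
                               int len, int is_write)
    {
            return addr >= SKETCH_MMIO_BASE &&
                   addr + len <= SKETCH_MMIO_BASE + SKETCH_MMIO_SIZE;
    }
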
 
@@ -1167,38 +2449,38 @@ static struct notifier_block kvm_cpu_notifier = {
        .priority = 20, /* must be > scheduler priority */
 };
 
-static u64 vm_stat_get(void *_offset)
+static int vm_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
-       u64 total = 0;
        struct kvm *kvm;
 
+       *val = 0;
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
-               total += *(u32 *)((void *)kvm + offset);
+               *val += *(u32 *)((void *)kvm + offset);
        spin_unlock(&kvm_lock);
-       return total;
+       return 0;
 }
 
 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
 
-static u64 vcpu_stat_get(void *_offset)
+static int vcpu_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
-       u64 total = 0;
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        int i;
 
+       *val = 0;
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
                        vcpu = kvm->vcpus[i];
                        if (vcpu)
-                               total += *(u32 *)((void *)vcpu + offset);
+                               *val += *(u32 *)((void *)vcpu + offset);
                }
        spin_unlock(&kvm_lock);
-       return total;
+       return 0;
 }
 
 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
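
With DEFINE_SIMPLE_ATTRIBUTE, the get() hook returns its value through an out-parameter and signals failure via the return code, which is exactly what the two conversions above adopt. The minimal shape of such a debugfs attribute:

    #include <linux/fs.h>
    #include <linux/debugfs.h>

    static u64 sketch_counter;      /* hypothetical statistic */

    static int sketch_stat_get(void *data, u64 *val)
    {
            *val = sketch_counter;
            return 0;               /* non-zero would fail the read */
    }
    DEFINE_SIMPLE_ATTRIBUTE(sketch_stat_fops, sketch_stat_get, NULL, "%llu\n");
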
@@ -1212,9 +2494,9 @@ static void kvm_init_debug(void)
 {
        struct kvm_stats_debugfs_item *p;
 
-       debugfs_dir = debugfs_create_dir("kvm", NULL);
+       kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
        for (p = debugfs_entries; p->name; ++p)
-               p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
+               p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
                                                (void *)(long)p->offset,
                                                stat_fops[p->kind]);
 }
@@ -1225,7 +2507,7 @@ static void kvm_exit_debug(void)
 
        for (p = debugfs_entries; p->name; ++p)
                debugfs_remove(p->dentry);
-       debugfs_remove(debugfs_dir);
+       debugfs_remove(kvm_debugfs_dir);
 }
 
 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
@@ -1252,6 +2534,7 @@ static struct sys_device kvm_sysdev = {
 };
 
 struct page *bad_page;
+pfn_t bad_pfn;
 
 static inline
 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
@@ -1293,19 +2576,27 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
                goto out;
        }
 
+       bad_pfn = page_to_pfn(bad_page);
+
+       if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
+               r = -ENOMEM;
+               goto out_free_0;
+       }
+
        r = kvm_arch_hardware_setup();
        if (r < 0)
-               goto out_free_0;
+               goto out_free_0a;
 
        for_each_online_cpu(cpu) {
                smp_call_function_single(cpu,
                                kvm_arch_check_processor_compat,
-                               &r, 0, 1);
+                               &r, 1);
                if (r < 0)
                        goto out_free_1;
        }
 
-       on_each_cpu(hardware_enable, NULL, 0, 1);
+       on_each_cpu(hardware_enable, NULL, 1);
        r = register_cpu_notifier(&kvm_cpu_notifier);
        if (r)
                goto out_free_2;
@@ -1329,6 +2620,8 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
        }
 
        kvm_chardev_ops.owner = module;
+       kvm_vm_fops.owner = module;
+       kvm_vcpu_fops.owner = module;
 
        r = misc_register(&kvm_dev);
        if (r) {
@@ -1351,9 +2644,11 @@ out_free_3:
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
 out_free_2:
-       on_each_cpu(hardware_disable, NULL, 0, 1);
+       on_each_cpu(hardware_disable, NULL, 1);
 out_free_1:
        kvm_arch_hardware_unsetup();
+out_free_0a:
+       free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
        __free_page(bad_page);
 out:
@@ -1366,16 +2661,18 @@ EXPORT_SYMBOL_GPL(kvm_init);
 
 void kvm_exit(void)
 {
+       kvm_trace_cleanup();
        misc_deregister(&kvm_dev);
        kmem_cache_destroy(kvm_vcpu_cache);
        sysdev_unregister(&kvm_sysdev);
        sysdev_class_unregister(&kvm_sysdev_class);
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
-       on_each_cpu(hardware_disable, NULL, 0, 1);
+       on_each_cpu(hardware_disable, NULL, 1);
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();
        kvm_exit_debug();
+       free_cpumask_var(cpus_hardware_enabled);
        __free_page(bad_page);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);