x64, x2apic/intr-remap: MSI and MSI-X support for interrupt remapping infrastructure
authorSuresh Siddha <suresh.b.siddha@intel.com>
Thu, 10 Jul 2008 18:16:57 +0000 (11:16 -0700)
committerIngo Molnar <mingo@elte.hu>
Sat, 12 Jul 2008 06:45:05 +0000 (08:45 +0200)
MSI and MSI-X support for interrupt remapping infrastructure.

MSI address register will be programmed with interrupt-remapping table
entry(IRTE) index and the IRTE will contain information about the vector,
cpu destination, etc.

For MSI-X, all the IRTE's will be consecutively allocated in the table,
and the address registers will contain the starting index to the block
and the data register will contain the subindex with in that block.

This also introduces a new irq_chip for cleaner irq migration (in the process
context as opposed to the current irq migration in the context of an interrupt.
interrupt-remapping infrastructure will help us achieve this).

As MSI is edge triggered, irq migration is a simple atomic update(of vector
and cpu destination) of IRTE and flushing the hardware cache.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: akpm@linux-foundation.org
Cc: arjan@linux.intel.com
Cc: andi@firstfloor.org
Cc: ebiederm@xmission.com
Cc: jbarnes@virtuousgeek.org
Cc: steiner@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/x86/kernel/io_apic_64.c
drivers/pci/intr_remapping.c
include/asm-x86/msidef.h
include/linux/dmar.h

index 9bd02ef..877aa2e 100644 (file)
@@ -2297,6 +2297,9 @@ void destroy_irq(unsigned int irq)
 
        dynamic_irq_cleanup(irq);
 
+#ifdef CONFIG_INTR_REMAP
+       free_irte(irq);
+#endif
        spin_lock_irqsave(&vector_lock, flags);
        __clear_irq_vector(irq);
        spin_unlock_irqrestore(&vector_lock, flags);
@@ -2315,11 +2318,42 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 
        tmp = TARGET_CPUS;
        err = assign_irq_vector(irq, tmp);
-       if (!err) {
-               cpus_and(tmp, cfg->domain, tmp);
-               dest = cpu_mask_to_apicid(tmp);
+       if (err)
+               return err;
+
+       cpus_and(tmp, cfg->domain, tmp);
+       dest = cpu_mask_to_apicid(tmp);
+
+#ifdef CONFIG_INTR_REMAP
+       if (irq_remapped(irq)) {
+               struct irte irte;
+               int ir_index;
+               u16 sub_handle;
+
+               ir_index = map_irq_to_irte_handle(irq, &sub_handle);
+               BUG_ON(ir_index == -1);
+
+               memset (&irte, 0, sizeof(irte));
+
+               irte.present = 1;
+               irte.dst_mode = INT_DEST_MODE;
+               irte.trigger_mode = 0; /* edge */
+               irte.dlvry_mode = INT_DELIVERY_MODE;
+               irte.vector = cfg->vector;
+               irte.dest_id = IRTE_DEST(dest);
+
+               modify_irte(irq, &irte);
 
                msg->address_hi = MSI_ADDR_BASE_HI;
+               msg->data = sub_handle;
+               msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+                                 MSI_ADDR_IR_SHV |
+                                 MSI_ADDR_IR_INDEX1(ir_index) |
+                                 MSI_ADDR_IR_INDEX2(ir_index);
+       } else
+#endif
+       {
+               msg->address_hi = MSI_ADDR_BASE_HI;
                msg->address_lo =
                        MSI_ADDR_BASE_LO |
                        ((INT_DEST_MODE == 0) ?
@@ -2369,6 +2403,55 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
        write_msi_msg(irq, &msg);
        irq_desc[irq].affinity = mask;
 }
+
+#ifdef CONFIG_INTR_REMAP
+/*
+ * Migrate the MSI irq to another cpumask. This migration is
+ * done in the process context using interrupt-remapping hardware.
+ */
+static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+       struct irq_cfg *cfg = irq_cfg + irq;
+       unsigned int dest;
+       cpumask_t tmp, cleanup_mask;
+       struct irte irte;
+
+       cpus_and(tmp, mask, cpu_online_map);
+       if (cpus_empty(tmp))
+               return;
+
+       if (get_irte(irq, &irte))
+               return;
+
+       if (assign_irq_vector(irq, mask))
+               return;
+
+       cpus_and(tmp, cfg->domain, mask);
+       dest = cpu_mask_to_apicid(tmp);
+
+       irte.vector = cfg->vector;
+       irte.dest_id = IRTE_DEST(dest);
+
+       /*
+        * atomically update the IRTE with the new destination and vector.
+        */
+       modify_irte(irq, &irte);
+
+       /*
+        * After this point, all the interrupts will start arriving
+        * at the new destination. So, time to cleanup the previous
+        * vector allocation.
+        */
+       if (cfg->move_in_progress) {
+               cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+               cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+               send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+               cfg->move_in_progress = 0;
+       }
+
+       irq_desc[irq].affinity = mask;
+}
+#endif
 #endif /* CONFIG_SMP */
 
 /*
@@ -2386,26 +2469,157 @@ static struct irq_chip msi_chip = {
        .retrigger      = ioapic_retrigger_irq,
 };
 
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+#ifdef CONFIG_INTR_REMAP
+static struct irq_chip msi_ir_chip = {
+       .name           = "IR-PCI-MSI",
+       .unmask         = unmask_msi_irq,
+       .mask           = mask_msi_irq,
+       .ack            = ack_x2apic_edge,
+#ifdef CONFIG_SMP
+       .set_affinity   = ir_set_msi_irq_affinity,
+#endif
+       .retrigger      = ioapic_retrigger_irq,
+};
+
+/*
+ * Map the PCI dev to the corresponding remapping hardware unit
+ * and allocate 'nvec' consecutive interrupt-remapping table entries
+ * in it.
+ */
+static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
+{
+       struct intel_iommu *iommu;
+       int index;
+
+       iommu = map_dev_to_ir(dev);
+       if (!iommu) {
+               printk(KERN_ERR
+                      "Unable to map PCI %s to iommu\n", pci_name(dev));
+               return -ENOENT;
+       }
+
+       index = alloc_irte(iommu, irq, nvec);
+       if (index < 0) {
+               printk(KERN_ERR
+                      "Unable to allocate %d IRTE for PCI %s\n", nvec,
+                       pci_name(dev));
+               return -ENOSPC;
+       }
+       return index;
+}
+#endif
+
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
 {
+       int ret;
        struct msi_msg msg;
+
+       ret = msi_compose_msg(dev, irq, &msg);
+       if (ret < 0)
+               return ret;
+
+       set_irq_msi(irq, desc);
+       write_msi_msg(irq, &msg);
+
+#ifdef CONFIG_INTR_REMAP
+       if (irq_remapped(irq)) {
+               struct irq_desc *desc = irq_desc + irq;
+               /*
+                * irq migration in process context
+                */
+               desc->status |= IRQ_MOVE_PCNTXT;
+               set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
+       } else
+#endif
+               set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+
+       return 0;
+}
+
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
        int irq, ret;
+
        irq = create_irq();
        if (irq < 0)
                return irq;
 
-       ret = msi_compose_msg(dev, irq, &msg);
+#ifdef CONFIG_INTR_REMAP
+       if (!intr_remapping_enabled)
+               goto no_ir;
+
+       ret = msi_alloc_irte(dev, irq, 1);
+       if (ret < 0)
+               goto error;
+no_ir:
+#endif
+       ret = setup_msi_irq(dev, desc, irq);
        if (ret < 0) {
                destroy_irq(irq);
                return ret;
        }
+       return 0;
 
-       set_irq_msi(irq, desc);
-       write_msi_msg(irq, &msg);
+#ifdef CONFIG_INTR_REMAP
+error:
+       destroy_irq(irq);
+       return ret;
+#endif
+}
 
-       set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+       int irq, ret, sub_handle;
+       struct msi_desc *desc;
+#ifdef CONFIG_INTR_REMAP
+       struct intel_iommu *iommu = 0;
+       int index = 0;
+#endif
 
+       sub_handle = 0;
+       list_for_each_entry(desc, &dev->msi_list, list) {
+               irq = create_irq();
+               if (irq < 0)
+                       return irq;
+#ifdef CONFIG_INTR_REMAP
+               if (!intr_remapping_enabled)
+                       goto no_ir;
+
+               if (!sub_handle) {
+                       /*
+                        * allocate the consecutive block of IRTE's
+                        * for 'nvec'
+                        */
+                       index = msi_alloc_irte(dev, irq, nvec);
+                       if (index < 0) {
+                               ret = index;
+                               goto error;
+                       }
+               } else {
+                       iommu = map_dev_to_ir(dev);
+                       if (!iommu) {
+                               ret = -ENOENT;
+                               goto error;
+                       }
+                       /*
+                        * setup the mapping between the irq and the IRTE
+                        * base index, the sub_handle pointing to the
+                        * appropriate interrupt remap table entry.
+                        */
+                       set_irte_irq(irq, iommu, index, sub_handle);
+               }
+no_ir:
+#endif
+               ret = setup_msi_irq(dev, desc, irq);
+               if (ret < 0)
+                       goto error;
+               sub_handle++;
+       }
        return 0;
+
+error:
+       destroy_irq(irq);
+       return ret;
 }
 
 void arch_teardown_msi_irq(unsigned int irq)
index 32e55c7..bb642cc 100644 (file)
@@ -230,6 +230,17 @@ struct intel_iommu *map_ioapic_to_ir(int apic)
        return NULL;
 }
 
+struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
+{
+       struct dmar_drhd_unit *drhd;
+
+       drhd = dmar_find_matched_drhd_unit(dev);
+       if (!drhd)
+               return NULL;
+
+       return drhd->iommu;
+}
+
 int free_irte(int irq)
 {
        int index, i;
index 296f29c..57fd859 100644 (file)
@@ -48,4 +48,8 @@
 #define  MSI_ADDR_DEST_ID(dest)                (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
                                         MSI_ADDR_DEST_ID_MASK)
 
+#define MSI_ADDR_IR_EXT_INT            (1 << 4)
+#define MSI_ADDR_IR_SHV                        (1 << 3)
+#define MSI_ADDR_IR_INDEX1(index)      ((index & 0x8000) >> 13)
+#define MSI_ADDR_IR_INDEX2(index)      ((index & 0x7fff) << 5)
 #endif /* ASM_MSIDEF_H */
index bf41ffa..c360c55 100644 (file)
@@ -109,6 +109,7 @@ extern int flush_irte(int irq);
 extern int free_irte(int irq);
 
 extern int irq_remapped(int irq);
+extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev);
 extern struct intel_iommu *map_ioapic_to_ir(int apic);
 #else
 #define irq_remapped(irq)              (0)