2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
42 #define ROOT_SIZE VTD_PAGE_SIZE
43 #define CONTEXT_SIZE VTD_PAGE_SIZE
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48 #define IOAPIC_RANGE_START (0xfee00000)
49 #define IOAPIC_RANGE_END (0xfeefffff)
50 #define IOVA_START_ADDR (0x1000)
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
60 /* global iommu list, set NULL for ignored DMAR units */
61 static struct intel_iommu **g_iommus;
66 * 12-63: Context Ptr (12 - (haw-1))
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
74 static inline bool root_present(struct root_entry *root)
76 return (root->val & 1);
78 static inline void set_root_present(struct root_entry *root)
82 static inline void set_root_value(struct root_entry *root, unsigned long value)
84 root->val |= value & VTD_PAGE_MASK;
87 static inline struct context_entry *
88 get_context_addr_from_root(struct root_entry *root)
90 return (struct context_entry *)
91 (root_present(root)?phys_to_virt(
92 root->val & VTD_PAGE_MASK) :
99 * 1: fault processing disable
100 * 2-3: translation type
101 * 12-63: address space root
107 struct context_entry {
112 static inline bool context_present(struct context_entry *context)
114 return (context->lo & 1);
116 static inline void context_set_present(struct context_entry *context)
121 static inline void context_set_fault_enable(struct context_entry *context)
123 context->lo &= (((u64)-1) << 2) | 1;
126 #define CONTEXT_TT_MULTI_LEVEL 0
128 static inline void context_set_translation_type(struct context_entry *context,
131 context->lo &= (((u64)-1) << 4) | 3;
132 context->lo |= (value & 3) << 2;
135 static inline void context_set_address_root(struct context_entry *context,
138 context->lo |= value & VTD_PAGE_MASK;
141 static inline void context_set_address_width(struct context_entry *context,
144 context->hi |= value & 7;
147 static inline void context_set_domain_id(struct context_entry *context,
150 context->hi |= (value & ((1 << 16) - 1)) << 8;
153 static inline void context_clear_entry(struct context_entry *context)
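/*
 * Illustrative sketch, not part of the driver: how the helpers above compose
 * a context entry.  The numbers (domain id 5, a page table at physical
 * address 0x12345000, AGAW value 2 for a 4-level table) and the function
 * name are made up for the example.  Assuming the elided helper bodies
 * follow the bit layout described above, the result is
 * lo == 0x12345001 (address-space root | present, fault processing enabled)
 * and hi == 0x502 (domain id 5 << 8 | address width 2).
 */
static void __attribute__((unused)) context_entry_packing_example(void)
{
	struct context_entry ce = { .lo = 0, .hi = 0 };

	context_set_domain_id(&ce, 5);
	context_set_address_width(&ce, 2);
	context_set_address_root(&ce, 0x12345000UL);
	context_set_translation_type(&ce, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(&ce);
	context_set_present(&ce);
}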
165 * 12-63: Host physical address
171 static inline void dma_clear_pte(struct dma_pte *pte)
176 static inline void dma_set_pte_readable(struct dma_pte *pte)
178 pte->val |= DMA_PTE_READ;
181 static inline void dma_set_pte_writable(struct dma_pte *pte)
183 pte->val |= DMA_PTE_WRITE;
186 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
188 pte->val = (pte->val & ~3) | (prot & 3);
191 static inline u64 dma_pte_addr(struct dma_pte *pte)
193 return (pte->val & VTD_PAGE_MASK);
196 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
198 pte->val |= (addr & VTD_PAGE_MASK);
201 static inline bool dma_pte_present(struct dma_pte *pte)
203 return (pte->val & 3) != 0;
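/*
 * Illustrative sketch, not part of the driver: a last-level PTE for a
 * readable/writable mapping of host physical page 0xabcde000.  Assuming
 * DMA_PTE_READ and DMA_PTE_WRITE are bits 0 and 1 (the two bits tested by
 * dma_pte_present() above), pte.val ends up as 0xabcde003.  The function
 * name is made up for the example.
 */
static void __attribute__((unused)) dma_pte_packing_example(void)
{
	struct dma_pte pte = { .val = 0 };

	dma_set_pte_addr(&pte, 0xabcde000ULL);
	dma_set_pte_readable(&pte);
	dma_set_pte_writable(&pte);
}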
206 /* devices under the same p2p bridge are owned by one domain */
207 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
209 /* domain represents a virtual machine; more than one device
210 * across iommus may be owned by one domain, e.g. a kvm guest.
212 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
215 int id; /* domain id */
216 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
218 struct list_head devices; /* all devices' list */
219 struct iova_domain iovad; /* iova's that belong to this domain */
221 struct dma_pte *pgd; /* virtual address */
222 spinlock_t mapping_lock; /* page table lock */
223 int gaw; /* max guest address width */
225 /* adjusted guest address width; 0 means a 2-level, 30-bit table */
228 int flags; /* flags to find out type of domain */
230 int iommu_coherency;/* indicate coherency of iommu access */
231 int iommu_count; /* reference count of iommu */
232 spinlock_t iommu_lock; /* protect iommu set in domain */
235 /* PCI domain-device relationship */
236 struct device_domain_info {
237 struct list_head link; /* link to domain siblings */
238 struct list_head global; /* link to global list */
239 u8 bus; /* PCI bus number */
240 u8 devfn; /* PCI devfn number */
241 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
242 struct dmar_domain *domain; /* pointer to domain */
245 static void flush_unmaps_timeout(unsigned long data);
247 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
249 #define HIGH_WATER_MARK 250
250 struct deferred_flush_tables {
252 struct iova *iova[HIGH_WATER_MARK];
253 struct dmar_domain *domain[HIGH_WATER_MARK];
256 static struct deferred_flush_tables *deferred_flush;
258 /* bitmap for indexing intel_iommus */
259 static int g_num_of_iommus;
261 static DEFINE_SPINLOCK(async_umap_flush_lock);
262 static LIST_HEAD(unmaps_to_do);
265 static long list_size;
267 static void domain_remove_dev_info(struct dmar_domain *domain);
270 static int __initdata dmar_map_gfx = 1;
271 static int dmar_forcedac;
272 static int intel_iommu_strict;
274 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
275 static DEFINE_SPINLOCK(device_domain_lock);
276 static LIST_HEAD(device_domain_list);
278 static int __init intel_iommu_setup(char *str)
283 if (!strncmp(str, "off", 3)) {
285 printk(KERN_INFO"Intel-IOMMU: disabled\n");
286 } else if (!strncmp(str, "igfx_off", 8)) {
289 "Intel-IOMMU: disable GFX device mapping\n");
290 } else if (!strncmp(str, "forcedac", 8)) {
292 "Intel-IOMMU: Forcing DAC for PCI devices\n");
294 } else if (!strncmp(str, "strict", 6)) {
296 "Intel-IOMMU: disable batched IOTLB flush\n");
297 intel_iommu_strict = 1;
300 str += strcspn(str, ",");
306 __setup("intel_iommu=", intel_iommu_setup);
308 static struct kmem_cache *iommu_domain_cache;
309 static struct kmem_cache *iommu_devinfo_cache;
310 static struct kmem_cache *iommu_iova_cache;
312 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
317 /* trying to avoid low memory issues */
318 flags = current->flags & PF_MEMALLOC;
319 current->flags |= PF_MEMALLOC;
320 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
321 current->flags &= (~PF_MEMALLOC | flags);
326 static inline void *alloc_pgtable_page(void)
331 /* trying to avoid low memory issues */
332 flags = current->flags & PF_MEMALLOC;
333 current->flags |= PF_MEMALLOC;
334 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
335 current->flags &= (~PF_MEMALLOC | flags);
339 static inline void free_pgtable_page(void *vaddr)
341 free_page((unsigned long)vaddr);
344 static inline void *alloc_domain_mem(void)
346 return iommu_kmem_cache_alloc(iommu_domain_cache);
349 static void free_domain_mem(void *vaddr)
351 kmem_cache_free(iommu_domain_cache, vaddr);
354 static inline void * alloc_devinfo_mem(void)
356 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
359 static inline void free_devinfo_mem(void *vaddr)
361 kmem_cache_free(iommu_devinfo_cache, vaddr);
364 struct iova *alloc_iova_mem(void)
366 return iommu_kmem_cache_alloc(iommu_iova_cache);
369 void free_iova_mem(struct iova *iova)
371 kmem_cache_free(iommu_iova_cache, iova);
375 static inline int width_to_agaw(int width);
377 /* calculate agaw for each iommu.
378 * "SAGAW" may be different across iommus, use a default agaw, and
379 * get a supported less agaw for iommus that don't support the default agaw.
381 int iommu_calculate_agaw(struct intel_iommu *iommu)
386 sagaw = cap_sagaw(iommu->cap);
387 for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
389 if (test_bit(agaw, &sagaw))
396 /* in the native case, each domain is related to only one iommu */
397 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
401 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
403 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
404 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
407 return g_iommus[iommu_id];
410 /* "Coherency" capability may be different across iommus */
411 static void domain_update_iommu_coherency(struct dmar_domain *domain)
415 domain->iommu_coherency = 1;
417 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
418 for (; i < g_num_of_iommus; ) {
419 if (!ecap_coherent(g_iommus[i]->ecap)) {
420 domain->iommu_coherency = 0;
423 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
427 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
429 struct dmar_drhd_unit *drhd = NULL;
432 for_each_drhd_unit(drhd) {
436 for (i = 0; i < drhd->devices_cnt; i++)
437 if (drhd->devices[i]->bus->number == bus &&
438 drhd->devices[i]->devfn == devfn)
441 if (drhd->include_all)
448 static void domain_flush_cache(struct dmar_domain *domain,
449 void *addr, int size)
451 if (!domain->iommu_coherency)
452 clflush_cache_range(addr, size);
455 /* Gets context entry for a given bus and devfn */
456 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
459 struct root_entry *root;
460 struct context_entry *context;
461 unsigned long phy_addr;
464 spin_lock_irqsave(&iommu->lock, flags);
465 root = &iommu->root_entry[bus];
466 context = get_context_addr_from_root(root);
468 context = (struct context_entry *)alloc_pgtable_page();
470 spin_unlock_irqrestore(&iommu->lock, flags);
473 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
474 phy_addr = virt_to_phys((void *)context);
475 set_root_value(root, phy_addr);
476 set_root_present(root);
477 __iommu_flush_cache(iommu, root, sizeof(*root));
479 spin_unlock_irqrestore(&iommu->lock, flags);
480 return &context[devfn];
483 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
485 struct root_entry *root;
486 struct context_entry *context;
490 spin_lock_irqsave(&iommu->lock, flags);
491 root = &iommu->root_entry[bus];
492 context = get_context_addr_from_root(root);
497 ret = context_present(&context[devfn]);
499 spin_unlock_irqrestore(&iommu->lock, flags);
503 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
505 struct root_entry *root;
506 struct context_entry *context;
509 spin_lock_irqsave(&iommu->lock, flags);
510 root = &iommu->root_entry[bus];
511 context = get_context_addr_from_root(root);
513 context_clear_entry(&context[devfn]);
514 __iommu_flush_cache(iommu, &context[devfn], \
517 spin_unlock_irqrestore(&iommu->lock, flags);
520 static void free_context_table(struct intel_iommu *iommu)
522 struct root_entry *root;
525 struct context_entry *context;
527 spin_lock_irqsave(&iommu->lock, flags);
528 if (!iommu->root_entry) {
531 for (i = 0; i < ROOT_ENTRY_NR; i++) {
532 root = &iommu->root_entry[i];
533 context = get_context_addr_from_root(root);
535 free_pgtable_page(context);
537 free_pgtable_page(iommu->root_entry);
538 iommu->root_entry = NULL;
540 spin_unlock_irqrestore(&iommu->lock, flags);
543 /* page table handling */
544 #define LEVEL_STRIDE (9)
545 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
547 static inline int agaw_to_level(int agaw)
552 static inline int agaw_to_width(int agaw)
554 return 30 + agaw * LEVEL_STRIDE;
558 static inline int width_to_agaw(int width)
560 return (width - 30) / LEVEL_STRIDE;
563 static inline unsigned int level_to_offset_bits(int level)
565 return (12 + (level - 1) * LEVEL_STRIDE);
568 static inline int address_level_offset(u64 addr, int level)
570 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
573 static inline u64 level_mask(int level)
575 return ((u64)-1 << level_to_offset_bits(level));
578 static inline u64 level_size(int level)
580 return ((u64)1 << level_to_offset_bits(level));
583 static inline u64 align_to_level(u64 addr, int level)
585 return ((addr + level_size(level) - 1) & level_mask(level));
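/*
 * Illustrative sketch, not part of the driver: the page-table geometry the
 * helpers above describe for the default 48-bit domain width.  It assumes
 * the elided agaw_to_level() body is "agaw + 2"; the function name is made
 * up for the example.
 */
static void __attribute__((unused)) pgtable_geometry_example(void)
{
	int agaw  = width_to_agaw(48);		/* (48 - 30) / 9 == 2 */
	int level = agaw_to_level(agaw);	/* assumed agaw + 2 == 4 levels */

	/*
	 * A 4-level walk: address_level_offset() pulls a 9-bit index from
	 * bits 39-47 (level 4), 30-38, 21-29 and 12-20 (level 1), so each
	 * table has 512 entries, and level_size() gives the span covered
	 * by one entry: 512GB, 1GB, 2MB and 4KB respectively.
	 */
	(void)agaw;
	(void)level;
}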
588 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
590 int addr_width = agaw_to_width(domain->agaw);
591 struct dma_pte *parent, *pte = NULL;
592 int level = agaw_to_level(domain->agaw);
596 BUG_ON(!domain->pgd);
598 addr &= (((u64)1) << addr_width) - 1;
599 parent = domain->pgd;
601 spin_lock_irqsave(&domain->mapping_lock, flags);
605 offset = address_level_offset(addr, level);
606 pte = &parent[offset];
610 if (!dma_pte_present(pte)) {
611 tmp_page = alloc_pgtable_page();
614 spin_unlock_irqrestore(&domain->mapping_lock,
618 domain_flush_cache(domain, tmp_page, PAGE_SIZE);
619 dma_set_pte_addr(pte, virt_to_phys(tmp_page));
621 * higher-level tables always set r/w; the last-level page
622 * table controls read/write
624 dma_set_pte_readable(pte);
625 dma_set_pte_writable(pte);
626 domain_flush_cache(domain, pte, sizeof(*pte));
628 parent = phys_to_virt(dma_pte_addr(pte));
632 spin_unlock_irqrestore(&domain->mapping_lock, flags);
636 /* return the address's pte at a specific level */
637 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
640 struct dma_pte *parent, *pte = NULL;
641 int total = agaw_to_level(domain->agaw);
644 parent = domain->pgd;
645 while (level <= total) {
646 offset = address_level_offset(addr, total);
647 pte = &parent[offset];
651 if (!dma_pte_present(pte))
653 parent = phys_to_virt(dma_pte_addr(pte));
659 /* clear one page's last-level pte */
660 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
662 struct dma_pte *pte = NULL;
664 /* get last level pte */
665 pte = dma_addr_level_pte(domain, addr, 1);
669 domain_flush_cache(domain, pte, sizeof(*pte));
673 /* clear last-level ptes; a tlb flush should follow */
674 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
676 int addr_width = agaw_to_width(domain->agaw);
678 start &= (((u64)1) << addr_width) - 1;
679 end &= (((u64)1) << addr_width) - 1;
680 /* in case it's a partial page */
681 start = PAGE_ALIGN(start);
684 /* we don't need lock here, nobody else touches the iova range */
685 while (start < end) {
686 dma_pte_clear_one(domain, start);
687 start += VTD_PAGE_SIZE;
691 /* free page table pages. last level pte should already be cleared */
692 static void dma_pte_free_pagetable(struct dmar_domain *domain,
695 int addr_width = agaw_to_width(domain->agaw);
697 int total = agaw_to_level(domain->agaw);
701 start &= (((u64)1) << addr_width) - 1;
702 end &= (((u64)1) << addr_width) - 1;
704 /* we don't need lock here, nobody else touches the iova range */
706 while (level <= total) {
707 tmp = align_to_level(start, level);
708 if (tmp >= end || (tmp + level_size(level) > end))
712 pte = dma_addr_level_pte(domain, tmp, level);
715 phys_to_virt(dma_pte_addr(pte)));
717 domain_flush_cache(domain, pte, sizeof(*pte));
719 tmp += level_size(level);
724 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
725 free_pgtable_page(domain->pgd);
731 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
733 struct root_entry *root;
736 root = (struct root_entry *)alloc_pgtable_page();
740 __iommu_flush_cache(iommu, root, ROOT_SIZE);
742 spin_lock_irqsave(&iommu->lock, flags);
743 iommu->root_entry = root;
744 spin_unlock_irqrestore(&iommu->lock, flags);
749 static void iommu_set_root_entry(struct intel_iommu *iommu)
755 addr = iommu->root_entry;
757 spin_lock_irqsave(&iommu->register_lock, flag);
758 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
760 cmd = iommu->gcmd | DMA_GCMD_SRTP;
761 writel(cmd, iommu->reg + DMAR_GCMD_REG);
763 /* Make sure hardware completes it */
764 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
765 readl, (sts & DMA_GSTS_RTPS), sts);
767 spin_unlock_irqrestore(&iommu->register_lock, flag);
770 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
775 if (!cap_rwbf(iommu->cap))
777 val = iommu->gcmd | DMA_GCMD_WBF;
779 spin_lock_irqsave(&iommu->register_lock, flag);
780 writel(val, iommu->reg + DMAR_GCMD_REG);
782 /* Make sure hardware completes it */
783 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
784 readl, (!(val & DMA_GSTS_WBFS)), val);
786 spin_unlock_irqrestore(&iommu->register_lock, flag);
789 /* the return value determines whether we need a write buffer flush */
790 static int __iommu_flush_context(struct intel_iommu *iommu,
791 u16 did, u16 source_id, u8 function_mask, u64 type,
792 int non_present_entry_flush)
798 * In the non-present entry flush case, if the hardware doesn't cache
799 * non-present entries we do nothing; if the hardware does cache
800 * non-present entries, we flush the entries of domain 0 (that domain
801 * id is used to cache any non-present entries)
803 if (non_present_entry_flush) {
804 if (!cap_caching_mode(iommu->cap))
811 case DMA_CCMD_GLOBAL_INVL:
812 val = DMA_CCMD_GLOBAL_INVL;
814 case DMA_CCMD_DOMAIN_INVL:
815 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
817 case DMA_CCMD_DEVICE_INVL:
818 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
819 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
826 spin_lock_irqsave(&iommu->register_lock, flag);
827 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
829 /* Make sure hardware completes it */
830 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
831 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
833 spin_unlock_irqrestore(&iommu->register_lock, flag);
835 /* flush context entry will implicitly flush write buffer */
839 /* the return value determines whether we need a write buffer flush */
840 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
841 u64 addr, unsigned int size_order, u64 type,
842 int non_present_entry_flush)
844 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
845 u64 val = 0, val_iva = 0;
849 * In the non-present entry flush case, if the hardware doesn't cache
850 * non-present entries we do nothing; if the hardware does cache
851 * non-present entries, we flush the entries of domain 0 (that domain
852 * id is used to cache any non-present entries)
854 if (non_present_entry_flush) {
855 if (!cap_caching_mode(iommu->cap))
862 case DMA_TLB_GLOBAL_FLUSH:
863 /* global flush doesn't need to set IVA_REG */
864 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
866 case DMA_TLB_DSI_FLUSH:
867 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
869 case DMA_TLB_PSI_FLUSH:
870 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
871 /* Note: always flush non-leaf currently */
872 val_iva = size_order | addr;
877 /* Note: set drain read/write */
880 * This is probably just to be extra safe; it looks like we can
881 * ignore it without any impact.
883 if (cap_read_drain(iommu->cap))
884 val |= DMA_TLB_READ_DRAIN;
886 if (cap_write_drain(iommu->cap))
887 val |= DMA_TLB_WRITE_DRAIN;
889 spin_lock_irqsave(&iommu->register_lock, flag);
890 /* Note: Only uses first TLB reg currently */
892 dmar_writeq(iommu->reg + tlb_offset, val_iva);
893 dmar_writeq(iommu->reg + tlb_offset + 8, val);
895 /* Make sure hardware complete it */
896 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
897 dmar_readq, (!(val & DMA_TLB_IVT)), val);
899 spin_unlock_irqrestore(&iommu->register_lock, flag);
901 /* check IOTLB invalidation granularity */
902 if (DMA_TLB_IAIG(val) == 0)
903 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
904 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
905 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
906 (unsigned long long)DMA_TLB_IIRG(type),
907 (unsigned long long)DMA_TLB_IAIG(val));
908 /* flush iotlb entry will implicitly flush write buffer */
912 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
913 u64 addr, unsigned int pages, int non_present_entry_flush)
917 BUG_ON(addr & (~VTD_PAGE_MASK));
921 /* Fall back to domain-selective flush if there is no PSI support */
921 if (!cap_pgsel_inv(iommu->cap))
922 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
924 non_present_entry_flush);
927 * PSI requires the number of pages to be a power of two (2^x), and the
928 * base address to be naturally aligned to that size
930 mask = ilog2(__roundup_pow_of_two(pages));
931 /* Fall back to domain-selective flush if the size is too big */
932 if (mask > cap_max_amask_val(iommu->cap))
933 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
934 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
936 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
938 non_present_entry_flush);
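/*
 * Illustrative sketch, not part of the driver: how the PSI mask above is
 * derived.  For pages == 1, __roundup_pow_of_two() returns 1 and ilog2()
 * gives mask 0, i.e. a single 4KB page is invalidated; for pages == 9 the
 * count is rounded up to 16 and mask becomes 4, so the hardware invalidates
 * a naturally aligned 16-page (64KB) region containing addr.  The function
 * name is made up for the example.
 */
static void __attribute__((unused)) psi_mask_example(void)
{
	unsigned int mask_one  = ilog2(__roundup_pow_of_two(1));	/* == 0 */
	unsigned int mask_nine = ilog2(__roundup_pow_of_two(9));	/* == 4 */

	(void)mask_one;
	(void)mask_nine;
}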
941 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
946 spin_lock_irqsave(&iommu->register_lock, flags);
947 pmen = readl(iommu->reg + DMAR_PMEN_REG);
948 pmen &= ~DMA_PMEN_EPM;
949 writel(pmen, iommu->reg + DMAR_PMEN_REG);
951 /* wait for the protected region status bit to clear */
952 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
953 readl, !(pmen & DMA_PMEN_PRS), pmen);
955 spin_unlock_irqrestore(&iommu->register_lock, flags);
958 static int iommu_enable_translation(struct intel_iommu *iommu)
963 spin_lock_irqsave(&iommu->register_lock, flags);
964 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
966 /* Make sure hardware completes it */
967 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
968 readl, (sts & DMA_GSTS_TES), sts);
970 iommu->gcmd |= DMA_GCMD_TE;
971 spin_unlock_irqrestore(&iommu->register_lock, flags);
975 static int iommu_disable_translation(struct intel_iommu *iommu)
980 spin_lock_irqsave(&iommu->register_lock, flag);
981 iommu->gcmd &= ~DMA_GCMD_TE;
982 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
984 /* Make sure hardware completes it */
985 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
986 readl, (!(sts & DMA_GSTS_TES)), sts);
988 spin_unlock_irqrestore(&iommu->register_lock, flag);
992 /* iommu interrupt handling. Most of it is MSI-like. */
994 static const char *fault_reason_strings[] =
997 "Present bit in root entry is clear",
998 "Present bit in context entry is clear",
999 "Invalid context entry",
1000 "Access beyond MGAW",
1001 "PTE Write access is not set",
1002 "PTE Read access is not set",
1003 "Next page table ptr is invalid",
1004 "Root table address invalid",
1005 "Context table ptr is invalid",
1006 "non-zero reserved fields in RTP",
1007 "non-zero reserved fields in CTP",
1008 "non-zero reserved fields in PTE",
1010 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
1012 const char *dmar_get_fault_reason(u8 fault_reason)
1014 if (fault_reason > MAX_FAULT_REASON_IDX)
1017 return fault_reason_strings[fault_reason];
1020 void dmar_msi_unmask(unsigned int irq)
1022 struct intel_iommu *iommu = get_irq_data(irq);
1026 spin_lock_irqsave(&iommu->register_lock, flag);
1027 writel(0, iommu->reg + DMAR_FECTL_REG);
1028 /* Read a reg to force-flush the posted write */
1029 readl(iommu->reg + DMAR_FECTL_REG);
1030 spin_unlock_irqrestore(&iommu->register_lock, flag);
1033 void dmar_msi_mask(unsigned int irq)
1036 struct intel_iommu *iommu = get_irq_data(irq);
1039 spin_lock_irqsave(&iommu->register_lock, flag);
1040 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1041 /* Read a reg to force-flush the posted write */
1042 readl(iommu->reg + DMAR_FECTL_REG);
1043 spin_unlock_irqrestore(&iommu->register_lock, flag);
1046 void dmar_msi_write(int irq, struct msi_msg *msg)
1048 struct intel_iommu *iommu = get_irq_data(irq);
1051 spin_lock_irqsave(&iommu->register_lock, flag);
1052 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1053 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1054 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1055 spin_unlock_irqrestore(&iommu->register_lock, flag);
1058 void dmar_msi_read(int irq, struct msi_msg *msg)
1060 struct intel_iommu *iommu = get_irq_data(irq);
1063 spin_lock_irqsave(&iommu->register_lock, flag);
1064 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1065 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1066 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1067 spin_unlock_irqrestore(&iommu->register_lock, flag);
1070 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1071 u8 fault_reason, u16 source_id, unsigned long long addr)
1075 reason = dmar_get_fault_reason(fault_reason);
1078 "DMAR:[%s] Request device [%02x:%02x.%d] "
1079 "fault addr %llx \n"
1080 "DMAR:[fault reason %02d] %s\n",
1081 (type ? "DMA Read" : "DMA Write"),
1082 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1083 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1087 #define PRIMARY_FAULT_REG_LEN (16)
1088 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1090 struct intel_iommu *iommu = dev_id;
1091 int reg, fault_index;
1095 spin_lock_irqsave(&iommu->register_lock, flag);
1096 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1098 /* TBD: ignore advanced fault log currently */
1099 if (!(fault_status & DMA_FSTS_PPF))
1100 goto clear_overflow;
1102 fault_index = dma_fsts_fault_record_index(fault_status);
1103 reg = cap_fault_reg_offset(iommu->cap);
1111 /* highest 32 bits */
1112 data = readl(iommu->reg + reg +
1113 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1114 if (!(data & DMA_FRCD_F))
1117 fault_reason = dma_frcd_fault_reason(data);
1118 type = dma_frcd_type(data);
1120 data = readl(iommu->reg + reg +
1121 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1122 source_id = dma_frcd_source_id(data);
1124 guest_addr = dmar_readq(iommu->reg + reg +
1125 fault_index * PRIMARY_FAULT_REG_LEN);
1126 guest_addr = dma_frcd_page_addr(guest_addr);
1127 /* clear the fault */
1128 writel(DMA_FRCD_F, iommu->reg + reg +
1129 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1131 spin_unlock_irqrestore(&iommu->register_lock, flag);
1133 iommu_page_fault_do_one(iommu, type, fault_reason,
1134 source_id, guest_addr);
1137 if (fault_index > cap_num_fault_regs(iommu->cap))
1139 spin_lock_irqsave(&iommu->register_lock, flag);
1142 /* clear primary fault overflow */
1143 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1144 if (fault_status & DMA_FSTS_PFO)
1145 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1147 spin_unlock_irqrestore(&iommu->register_lock, flag);
1151 int dmar_set_interrupt(struct intel_iommu *iommu)
1157 printk(KERN_ERR "IOMMU: no free vectors\n");
1161 set_irq_data(irq, iommu);
1164 ret = arch_setup_dmar_msi(irq);
1166 set_irq_data(irq, NULL);
1172 /* Make sure the fault register is cleared */
1173 iommu_page_fault(irq, iommu);
1175 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1177 printk(KERN_ERR "IOMMU: can't request irq\n");
1181 static int iommu_init_domains(struct intel_iommu *iommu)
1183 unsigned long ndomains;
1184 unsigned long nlongs;
1186 ndomains = cap_ndoms(iommu->cap);
1187 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1188 nlongs = BITS_TO_LONGS(ndomains);
1190 /* TBD: there might be 64K domains;
1191 * consider other allocation schemes for future chips
1193 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1194 if (!iommu->domain_ids) {
1195 printk(KERN_ERR "Allocating domain id array failed\n");
1198 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1200 if (!iommu->domains) {
1201 printk(KERN_ERR "Allocating domain array failed\n");
1202 kfree(iommu->domain_ids);
1206 spin_lock_init(&iommu->lock);
1209 * if Caching mode is set, then invalid translations are tagged
1210 * with domain id 0. Hence we need to pre-allocate it.
1212 if (cap_caching_mode(iommu->cap))
1213 set_bit(0, iommu->domain_ids);
1218 static void domain_exit(struct dmar_domain *domain);
1219 static void vm_domain_exit(struct dmar_domain *domain);
1221 void free_dmar_iommu(struct intel_iommu *iommu)
1223 struct dmar_domain *domain;
1225 unsigned long flags;
1227 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1228 for (; i < cap_ndoms(iommu->cap); ) {
1229 domain = iommu->domains[i];
1230 clear_bit(i, iommu->domain_ids);
1232 spin_lock_irqsave(&domain->iommu_lock, flags);
1233 if (--domain->iommu_count == 0) {
1234 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1235 vm_domain_exit(domain);
1237 domain_exit(domain);
1239 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1241 i = find_next_bit(iommu->domain_ids,
1242 cap_ndoms(iommu->cap), i+1);
1245 if (iommu->gcmd & DMA_GCMD_TE)
1246 iommu_disable_translation(iommu);
1249 set_irq_data(iommu->irq, NULL);
1250 /* This will mask the irq */
1251 free_irq(iommu->irq, iommu);
1252 destroy_irq(iommu->irq);
1255 kfree(iommu->domains);
1256 kfree(iommu->domain_ids);
1258 g_iommus[iommu->seq_id] = NULL;
1260 /* if all iommus are freed, free g_iommus */
1261 for (i = 0; i < g_num_of_iommus; i++) {
1266 if (i == g_num_of_iommus)
1269 /* free context mapping */
1270 free_context_table(iommu);
1273 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1276 unsigned long ndomains;
1277 struct dmar_domain *domain;
1278 unsigned long flags;
1280 domain = alloc_domain_mem();
1284 ndomains = cap_ndoms(iommu->cap);
1286 spin_lock_irqsave(&iommu->lock, flags);
1287 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1288 if (num >= ndomains) {
1289 spin_unlock_irqrestore(&iommu->lock, flags);
1290 free_domain_mem(domain);
1291 printk(KERN_ERR "IOMMU: no free domain ids\n");
1295 set_bit(num, iommu->domain_ids);
1297 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1298 set_bit(iommu->seq_id, &domain->iommu_bmp);
1300 iommu->domains[num] = domain;
1301 spin_unlock_irqrestore(&iommu->lock, flags);
1306 static void iommu_free_domain(struct dmar_domain *domain)
1308 unsigned long flags;
1309 struct intel_iommu *iommu;
1311 iommu = domain_get_iommu(domain);
1313 spin_lock_irqsave(&iommu->lock, flags);
1314 clear_bit(domain->id, iommu->domain_ids);
1315 spin_unlock_irqrestore(&iommu->lock, flags);
1318 static struct iova_domain reserved_iova_list;
1319 static struct lock_class_key reserved_alloc_key;
1320 static struct lock_class_key reserved_rbtree_key;
1322 static void dmar_init_reserved_ranges(void)
1324 struct pci_dev *pdev = NULL;
1329 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1331 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1332 &reserved_alloc_key);
1333 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1334 &reserved_rbtree_key);
1336 /* IOAPIC ranges shouldn't be accessed by DMA */
1337 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1338 IOVA_PFN(IOAPIC_RANGE_END));
1340 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1342 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1343 for_each_pci_dev(pdev) {
1346 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1347 r = &pdev->resource[i];
1348 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1352 size = r->end - addr;
1353 size = PAGE_ALIGN(size);
1354 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1355 IOVA_PFN(size + addr) - 1);
1357 printk(KERN_ERR "Reserve iova failed\n");
1363 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1365 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1368 static inline int guestwidth_to_adjustwidth(int gaw)
1371 int r = (gaw - 12) % 9;
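/*
 * Illustrative sketch, not part of the driver: the partly elided body above
 * is assumed to round the guest width up to the next value for which
 * (gaw - 12) is a multiple of 9, i.e. up to a whole number of page-table
 * levels.  A 39-bit guest width has r == 0 and stays 39 (exactly 3 levels),
 * while a 36-bit guest width has r == 6 and is adjusted up to 39.  The
 * function name is made up for the example.
 */
static void __attribute__((unused)) adjustwidth_example(void)
{
	int w39 = guestwidth_to_adjustwidth(39);	/* assumed == 39 */
	int w36 = guestwidth_to_adjustwidth(36);	/* assumed == 39 */

	(void)w39;
	(void)w36;
}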
1382 static int domain_init(struct dmar_domain *domain, int guest_width)
1384 struct intel_iommu *iommu;
1385 int adjust_width, agaw;
1386 unsigned long sagaw;
1388 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1389 spin_lock_init(&domain->mapping_lock);
1390 spin_lock_init(&domain->iommu_lock);
1392 domain_reserve_special_ranges(domain);
1394 /* calculate AGAW */
1395 iommu = domain_get_iommu(domain);
1396 if (guest_width > cap_mgaw(iommu->cap))
1397 guest_width = cap_mgaw(iommu->cap);
1398 domain->gaw = guest_width;
1399 adjust_width = guestwidth_to_adjustwidth(guest_width);
1400 agaw = width_to_agaw(adjust_width);
1401 sagaw = cap_sagaw(iommu->cap);
1402 if (!test_bit(agaw, &sagaw)) {
1403 /* hardware doesn't support it, choose a bigger one */
1404 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1405 agaw = find_next_bit(&sagaw, 5, agaw);
1409 domain->agaw = agaw;
1410 INIT_LIST_HEAD(&domain->devices);
1412 if (ecap_coherent(iommu->ecap))
1413 domain->iommu_coherency = 1;
1415 domain->iommu_coherency = 0;
1417 domain->iommu_count = 1;
1419 /* always allocate the top pgd */
1420 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1423 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1427 static void domain_exit(struct dmar_domain *domain)
1431 /* Domain 0 is reserved, so don't process it */
1435 domain_remove_dev_info(domain);
1437 put_iova_domain(&domain->iovad);
1438 end = DOMAIN_MAX_ADDR(domain->gaw);
1439 end = end & (~PAGE_MASK);
1442 dma_pte_clear_range(domain, 0, end);
1444 /* free page tables */
1445 dma_pte_free_pagetable(domain, 0, end);
1447 iommu_free_domain(domain);
1448 free_domain_mem(domain);
1451 static int domain_context_mapping_one(struct dmar_domain *domain,
1454 struct context_entry *context;
1455 unsigned long flags;
1456 struct intel_iommu *iommu;
1457 struct dma_pte *pgd;
1459 unsigned long ndomains;
1463 pr_debug("Set context mapping for %02x:%02x.%d\n",
1464 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1465 BUG_ON(!domain->pgd);
1467 iommu = device_to_iommu(bus, devfn);
1471 context = device_to_context_entry(iommu, bus, devfn);
1474 spin_lock_irqsave(&iommu->lock, flags);
1475 if (context_present(context)) {
1476 spin_unlock_irqrestore(&iommu->lock, flags);
1483 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1486 /* find an available domain id for this device in iommu */
1487 ndomains = cap_ndoms(iommu->cap);
1488 num = find_first_bit(iommu->domain_ids, ndomains);
1489 for (; num < ndomains; ) {
1490 if (iommu->domains[num] == domain) {
1495 num = find_next_bit(iommu->domain_ids,
1496 cap_ndoms(iommu->cap), num+1);
1500 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1501 if (num >= ndomains) {
1502 spin_unlock_irqrestore(&iommu->lock, flags);
1503 printk(KERN_ERR "IOMMU: no free domain ids\n");
1507 set_bit(num, iommu->domain_ids);
1508 iommu->domains[num] = domain;
1512 /* Skip top levels of page tables for
1513 * iommus which have a smaller agaw than the default.
1515 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1516 pgd = phys_to_virt(dma_pte_addr(pgd));
1517 if (!dma_pte_present(pgd)) {
1518 spin_unlock_irqrestore(&iommu->lock, flags);
1524 context_set_domain_id(context, id);
1525 context_set_address_width(context, iommu->agaw);
1526 context_set_address_root(context, virt_to_phys(pgd));
1527 context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1528 context_set_fault_enable(context);
1529 context_set_present(context);
1530 domain_flush_cache(domain, context, sizeof(*context));
1532 /* it's a non-present to present mapping */
1533 if (iommu->flush.flush_context(iommu, domain->id,
1534 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1535 DMA_CCMD_DEVICE_INVL, 1))
1536 iommu_flush_write_buffer(iommu);
1538 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1540 spin_unlock_irqrestore(&iommu->lock, flags);
1542 spin_lock_irqsave(&domain->iommu_lock, flags);
1543 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1544 domain->iommu_count++;
1545 domain_update_iommu_coherency(domain);
1547 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1552 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1555 struct pci_dev *tmp, *parent;
1557 ret = domain_context_mapping_one(domain, pdev->bus->number,
1562 /* dependent device mapping */
1563 tmp = pci_find_upstream_pcie_bridge(pdev);
1566 /* Secondary interface's bus number and devfn 0 */
1567 parent = pdev->bus->self;
1568 while (parent != tmp) {
1569 ret = domain_context_mapping_one(domain, parent->bus->number,
1573 parent = parent->bus->self;
1575 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1576 return domain_context_mapping_one(domain,
1577 tmp->subordinate->number, 0);
1578 else /* this is a legacy PCI bridge */
1579 return domain_context_mapping_one(domain,
1580 tmp->bus->number, tmp->devfn);
1583 static int domain_context_mapped(struct pci_dev *pdev)
1586 struct pci_dev *tmp, *parent;
1587 struct intel_iommu *iommu;
1589 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1593 ret = device_context_mapped(iommu,
1594 pdev->bus->number, pdev->devfn);
1597 /* dependent device mapping */
1598 tmp = pci_find_upstream_pcie_bridge(pdev);
1601 /* Secondary interface's bus number and devfn 0 */
1602 parent = pdev->bus->self;
1603 while (parent != tmp) {
1604 ret = device_context_mapped(iommu, parent->bus->number,
1608 parent = parent->bus->self;
1611 return device_context_mapped(iommu,
1612 tmp->subordinate->number, 0);
1614 return device_context_mapped(iommu,
1615 tmp->bus->number, tmp->devfn);
1619 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1620 u64 hpa, size_t size, int prot)
1622 u64 start_pfn, end_pfn;
1623 struct dma_pte *pte;
1625 int addr_width = agaw_to_width(domain->agaw);
1627 hpa &= (((u64)1) << addr_width) - 1;
1629 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1632 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1633 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1635 while (start_pfn < end_pfn) {
1636 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1639 /* We don't need lock here, nobody else
1640 * touches the iova range
1642 BUG_ON(dma_pte_addr(pte));
1643 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1644 dma_set_pte_prot(pte, prot);
1645 domain_flush_cache(domain, pte, sizeof(*pte));
1652 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1657 clear_context_table(iommu, bus, devfn);
1658 iommu->flush.flush_context(iommu, 0, 0, 0,
1659 DMA_CCMD_GLOBAL_INVL, 0);
1660 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1661 DMA_TLB_GLOBAL_FLUSH, 0);
1664 static void domain_remove_dev_info(struct dmar_domain *domain)
1666 struct device_domain_info *info;
1667 unsigned long flags;
1668 struct intel_iommu *iommu;
1670 spin_lock_irqsave(&device_domain_lock, flags);
1671 while (!list_empty(&domain->devices)) {
1672 info = list_entry(domain->devices.next,
1673 struct device_domain_info, link);
1674 list_del(&info->link);
1675 list_del(&info->global);
1677 info->dev->dev.archdata.iommu = NULL;
1678 spin_unlock_irqrestore(&device_domain_lock, flags);
1680 iommu = device_to_iommu(info->bus, info->devfn);
1681 iommu_detach_dev(iommu, info->bus, info->devfn);
1682 free_devinfo_mem(info);
1684 spin_lock_irqsave(&device_domain_lock, flags);
1686 spin_unlock_irqrestore(&device_domain_lock, flags);
1691 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1693 static struct dmar_domain *
1694 find_domain(struct pci_dev *pdev)
1696 struct device_domain_info *info;
1698 /* No lock here, assumes no domain exit in normal case */
1699 info = pdev->dev.archdata.iommu;
1701 return info->domain;
1705 /* domain is initialized */
1706 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1708 struct dmar_domain *domain, *found = NULL;
1709 struct intel_iommu *iommu;
1710 struct dmar_drhd_unit *drhd;
1711 struct device_domain_info *info, *tmp;
1712 struct pci_dev *dev_tmp;
1713 unsigned long flags;
1714 int bus = 0, devfn = 0;
1716 domain = find_domain(pdev);
1720 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1722 if (dev_tmp->is_pcie) {
1723 bus = dev_tmp->subordinate->number;
1726 bus = dev_tmp->bus->number;
1727 devfn = dev_tmp->devfn;
1729 spin_lock_irqsave(&device_domain_lock, flags);
1730 list_for_each_entry(info, &device_domain_list, global) {
1731 if (info->bus == bus && info->devfn == devfn) {
1732 found = info->domain;
1736 spin_unlock_irqrestore(&device_domain_lock, flags);
1737 /* pcie-pci bridge already has a domain, use it */
1744 /* Allocate new domain for the device */
1745 drhd = dmar_find_matched_drhd_unit(pdev);
1747 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1751 iommu = drhd->iommu;
1753 domain = iommu_alloc_domain(iommu);
1757 if (domain_init(domain, gaw)) {
1758 domain_exit(domain);
1762 /* register pcie-to-pci device */
1764 info = alloc_devinfo_mem();
1766 domain_exit(domain);
1770 info->devfn = devfn;
1772 info->domain = domain;
1773 /* This domain is shared by devices under p2p bridge */
1774 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1776 /* pcie-to-pci bridge already has a domain, use it */
1778 spin_lock_irqsave(&device_domain_lock, flags);
1779 list_for_each_entry(tmp, &device_domain_list, global) {
1780 if (tmp->bus == bus && tmp->devfn == devfn) {
1781 found = tmp->domain;
1786 free_devinfo_mem(info);
1787 domain_exit(domain);
1790 list_add(&info->link, &domain->devices);
1791 list_add(&info->global, &device_domain_list);
1793 spin_unlock_irqrestore(&device_domain_lock, flags);
1797 info = alloc_devinfo_mem();
1800 info->bus = pdev->bus->number;
1801 info->devfn = pdev->devfn;
1803 info->domain = domain;
1804 spin_lock_irqsave(&device_domain_lock, flags);
1805 /* somebody else was faster and already set it up */
1806 found = find_domain(pdev);
1807 if (found != NULL) {
1808 spin_unlock_irqrestore(&device_domain_lock, flags);
1809 if (found != domain) {
1810 domain_exit(domain);
1813 free_devinfo_mem(info);
1816 list_add(&info->link, &domain->devices);
1817 list_add(&info->global, &device_domain_list);
1818 pdev->dev.archdata.iommu = info;
1819 spin_unlock_irqrestore(&device_domain_lock, flags);
1822 /* recheck it here, maybe others set it */
1823 return find_domain(pdev);
1826 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1827 unsigned long long start,
1828 unsigned long long end)
1830 struct dmar_domain *domain;
1832 unsigned long long base;
1836 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1837 pci_name(pdev), start, end);
1838 /* page table init */
1839 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1843 /* The address might not be aligned */
1844 base = start & PAGE_MASK;
1846 size = PAGE_ALIGN(size);
1847 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1848 IOVA_PFN(base + size) - 1)) {
1849 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1854 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1855 size, base, pci_name(pdev));
1857 * The RMRR range might overlap with a physical memory range,
1860 dma_pte_clear_range(domain, base, base + size);
1862 ret = domain_page_mapping(domain, base, base, size,
1863 DMA_PTE_READ|DMA_PTE_WRITE);
1867 /* context entry init */
1868 ret = domain_context_mapping(domain, pdev);
1872 domain_exit(domain);
1877 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1878 struct pci_dev *pdev)
1880 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1882 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1883 rmrr->end_address + 1);
1886 #ifdef CONFIG_DMAR_GFX_WA
1887 struct iommu_prepare_data {
1888 struct pci_dev *pdev;
1892 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1893 unsigned long end_pfn, void *datax)
1895 struct iommu_prepare_data *data;
1897 data = (struct iommu_prepare_data *)datax;
1899 data->ret = iommu_prepare_identity_map(data->pdev,
1900 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1905 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1908 struct iommu_prepare_data data;
1913 for_each_online_node(nid) {
1914 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1921 static void __init iommu_prepare_gfx_mapping(void)
1923 struct pci_dev *pdev = NULL;
1926 for_each_pci_dev(pdev) {
1927 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1928 !IS_GFX_DEVICE(pdev))
1930 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1932 ret = iommu_prepare_with_active_regions(pdev);
1934 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1937 #else /* !CONFIG_DMAR_GFX_WA */
1938 static inline void iommu_prepare_gfx_mapping(void)
1944 #ifdef CONFIG_DMAR_FLOPPY_WA
1945 static inline void iommu_prepare_isa(void)
1947 struct pci_dev *pdev;
1950 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1954 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1955 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1958 printk("IOMMU: Failed to create 0-64M identity map, "
1959 "floppy might not work\n");
1963 static inline void iommu_prepare_isa(void)
1967 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1969 static int __init init_dmars(void)
1971 struct dmar_drhd_unit *drhd;
1972 struct dmar_rmrr_unit *rmrr;
1973 struct pci_dev *pdev;
1974 struct intel_iommu *iommu;
1975 int i, ret, unit = 0;
1980 * initialize and program root entry to not present
1983 for_each_drhd_unit(drhd) {
1986 * lock not needed as this is only incremented in the single-
1987 * threaded kernel __init code path; all other accesses are reads
1992 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1995 printk(KERN_ERR "Allocating global iommu array failed\n");
2000 deferred_flush = kzalloc(g_num_of_iommus *
2001 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2002 if (!deferred_flush) {
2008 for_each_drhd_unit(drhd) {
2012 iommu = drhd->iommu;
2013 g_iommus[iommu->seq_id] = iommu;
2015 ret = iommu_init_domains(iommu);
2021 * we could share the same root & context tables
2022 * among all IOMMUs. Needs to be split later.
2024 ret = iommu_alloc_root_entry(iommu);
2026 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2031 for_each_drhd_unit(drhd) {
2035 iommu = drhd->iommu;
2036 if (dmar_enable_qi(iommu)) {
2038 * Queued Invalidate not enabled, use Register Based
2041 iommu->flush.flush_context = __iommu_flush_context;
2042 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2043 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2045 (unsigned long long)drhd->reg_base_addr);
2047 iommu->flush.flush_context = qi_flush_context;
2048 iommu->flush.flush_iotlb = qi_flush_iotlb;
2049 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2051 (unsigned long long)drhd->reg_base_addr);
2057 * for each dev attached to rmrr
2059 * locate drhd for dev, alloc domain for dev
2060 * allocate free domain
2061 * allocate page table entries for rmrr
2062 * if context not allocated for bus
2063 * allocate and init context
2064 * set present in root table for this bus
2065 * init context with domain, translation etc
2069 for_each_rmrr_units(rmrr) {
2070 for (i = 0; i < rmrr->devices_cnt; i++) {
2071 pdev = rmrr->devices[i];
2072 /* some BIOSes list non-existent devices in the DMAR table */
2075 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2078 "IOMMU: mapping reserved region failed\n");
2082 iommu_prepare_gfx_mapping();
2084 iommu_prepare_isa();
2089 * global invalidate context cache
2090 * global invalidate iotlb
2091 * enable translation
2093 for_each_drhd_unit(drhd) {
2096 iommu = drhd->iommu;
2097 sprintf (iommu->name, "dmar%d", unit++);
2099 iommu_flush_write_buffer(iommu);
2101 ret = dmar_set_interrupt(iommu);
2105 iommu_set_root_entry(iommu);
2107 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2109 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2111 iommu_disable_protect_mem_regions(iommu);
2113 ret = iommu_enable_translation(iommu);
2120 for_each_drhd_unit(drhd) {
2123 iommu = drhd->iommu;
2130 static inline u64 aligned_size(u64 host_addr, size_t size)
2133 addr = (host_addr & (~PAGE_MASK)) + size;
2134 return PAGE_ALIGN(addr);
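/*
 * Illustrative sketch, not part of the driver: aligned_size() returns how
 * many bytes of IOVA space are needed to cover the buffer once it is mapped
 * page by page.  With 4KB pages, a 0x200-byte buffer starting at page
 * offset 0xf00 straddles a page boundary, so aligned_size(0x12345f00, 0x200)
 * is 0x2000 (two pages); the same buffer at offset 0 would need only 0x1000.
 * The function name is made up for the example.
 */
static u64 __attribute__((unused)) aligned_size_example(void)
{
	return aligned_size(0x12345f00ULL, 0x200);	/* == 0x2000 */
}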
2138 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2142 /* Make sure it's in range */
2143 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2144 if (!size || (IOVA_START_ADDR + size > end))
2147 piova = alloc_iova(&domain->iovad,
2148 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2152 static struct iova *
2153 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2154 size_t size, u64 dma_mask)
2156 struct pci_dev *pdev = to_pci_dev(dev);
2157 struct iova *iova = NULL;
2159 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2160 iova = iommu_alloc_iova(domain, size, dma_mask);
2163 * First try to allocate an io virtual address in
2164 * DMA_32BIT_MASK and if that fails then try allocating
2167 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2169 iova = iommu_alloc_iova(domain, size, dma_mask);
2173 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
2180 static struct dmar_domain *
2181 get_valid_domain_for_dev(struct pci_dev *pdev)
2183 struct dmar_domain *domain;
2186 domain = get_domain_for_dev(pdev,
2187 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2190 "Allocating domain for %s failed", pci_name(pdev));
2194 /* make sure context mapping is ok */
2195 if (unlikely(!domain_context_mapped(pdev))) {
2196 ret = domain_context_mapping(domain, pdev);
2199 "Domain context map for %s failed",
2208 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2209 size_t size, int dir, u64 dma_mask)
2211 struct pci_dev *pdev = to_pci_dev(hwdev);
2212 struct dmar_domain *domain;
2213 phys_addr_t start_paddr;
2217 struct intel_iommu *iommu;
2219 BUG_ON(dir == DMA_NONE);
2220 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2223 domain = get_valid_domain_for_dev(pdev);
2227 iommu = domain_get_iommu(domain);
2228 size = aligned_size((u64)paddr, size);
2230 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2234 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2237 * Check if DMAR supports zero-length reads on write only
2240 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2241 !cap_zlr(iommu->cap))
2242 prot |= DMA_PTE_READ;
2243 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2244 prot |= DMA_PTE_WRITE;
2246 * paddr through (paddr + size) might span a partial page; we should map
2247 * the whole page. Note: if two parts of one page are mapped separately,
2248 * we might have two guest addresses mapping to the same host paddr, but
2249 * this is not a big problem
2251 ret = domain_page_mapping(domain, start_paddr,
2252 ((u64)paddr) & PAGE_MASK, size, prot);
2256 /* it's a non-present to present mapping */
2257 ret = iommu_flush_iotlb_psi(iommu, domain->id,
2258 start_paddr, size >> VTD_PAGE_SHIFT, 1);
2260 iommu_flush_write_buffer(iommu);
2262 return start_paddr + ((u64)paddr & (~PAGE_MASK));
2266 __free_iova(&domain->iovad, iova);
2267 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
2268 pci_name(pdev), size, (unsigned long long)paddr, dir);
2272 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2273 size_t size, int dir)
2275 return __intel_map_single(hwdev, paddr, size, dir,
2276 to_pci_dev(hwdev)->dma_mask);
2279 static void flush_unmaps(void)
2285 /* just flush them all */
2286 for (i = 0; i < g_num_of_iommus; i++) {
2287 struct intel_iommu *iommu = g_iommus[i];
2291 if (deferred_flush[i].next) {
2292 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2293 DMA_TLB_GLOBAL_FLUSH, 0);
2294 for (j = 0; j < deferred_flush[i].next; j++) {
2295 __free_iova(&deferred_flush[i].domain[j]->iovad,
2296 deferred_flush[i].iova[j]);
2298 deferred_flush[i].next = 0;
2305 static void flush_unmaps_timeout(unsigned long data)
2307 unsigned long flags;
2309 spin_lock_irqsave(&async_umap_flush_lock, flags);
2311 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2314 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2316 unsigned long flags;
2318 struct intel_iommu *iommu;
2320 spin_lock_irqsave(&async_umap_flush_lock, flags);
2321 if (list_size == HIGH_WATER_MARK)
2324 iommu = domain_get_iommu(dom);
2325 iommu_id = iommu->seq_id;
2327 next = deferred_flush[iommu_id].next;
2328 deferred_flush[iommu_id].domain[next] = dom;
2329 deferred_flush[iommu_id].iova[next] = iova;
2330 deferred_flush[iommu_id].next++;
2333 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2337 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2340 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2343 struct pci_dev *pdev = to_pci_dev(dev);
2344 struct dmar_domain *domain;
2345 unsigned long start_addr;
2347 struct intel_iommu *iommu;
2349 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2351 domain = find_domain(pdev);
2354 iommu = domain_get_iommu(domain);
2356 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2360 start_addr = iova->pfn_lo << PAGE_SHIFT;
2361 size = aligned_size((u64)dev_addr, size);
2363 pr_debug("Device %s unmapping: %lx@%llx\n",
2364 pci_name(pdev), size, (unsigned long long)start_addr);
2366 /* clear the whole page */
2367 dma_pte_clear_range(domain, start_addr, start_addr + size);
2368 /* free page tables */
2369 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2370 if (intel_iommu_strict) {
2371 if (iommu_flush_iotlb_psi(iommu,
2372 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2373 iommu_flush_write_buffer(iommu);
2375 __free_iova(&domain->iovad, iova);
2377 add_unmap(domain, iova);
2379 * queue up the release of the unmap to save the 1/6th of the
2380 * cpu used up by the iotlb flush operation...
2385 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2386 dma_addr_t *dma_handle, gfp_t flags)
2391 size = PAGE_ALIGN(size);
2392 order = get_order(size);
2393 flags &= ~(GFP_DMA | GFP_DMA32);
2395 vaddr = (void *)__get_free_pages(flags, order);
2398 memset(vaddr, 0, size);
2400 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2402 hwdev->coherent_dma_mask);
2405 free_pages((unsigned long)vaddr, order);
2409 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2410 dma_addr_t dma_handle)
2414 size = PAGE_ALIGN(size);
2415 order = get_order(size);
2417 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2418 free_pages((unsigned long)vaddr, order);
2421 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2423 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2424 int nelems, int dir)
2427 struct pci_dev *pdev = to_pci_dev(hwdev);
2428 struct dmar_domain *domain;
2429 unsigned long start_addr;
2433 struct scatterlist *sg;
2434 struct intel_iommu *iommu;
2436 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2439 domain = find_domain(pdev);
2442 iommu = domain_get_iommu(domain);
2444 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2447 for_each_sg(sglist, sg, nelems, i) {
2448 addr = SG_ENT_VIRT_ADDRESS(sg);
2449 size += aligned_size((u64)addr, sg->length);
2452 start_addr = iova->pfn_lo << PAGE_SHIFT;
2454 /* clear the whole page */
2455 dma_pte_clear_range(domain, start_addr, start_addr + size);
2456 /* free page tables */
2457 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2459 if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2460 size >> VTD_PAGE_SHIFT, 0))
2461 iommu_flush_write_buffer(iommu);
2464 __free_iova(&domain->iovad, iova);
2467 static int intel_nontranslate_map_sg(struct device *hddev,
2468 struct scatterlist *sglist, int nelems, int dir)
2471 struct scatterlist *sg;
2473 for_each_sg(sglist, sg, nelems, i) {
2474 BUG_ON(!sg_page(sg));
2475 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2476 sg->dma_length = sg->length;
2481 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2486 struct pci_dev *pdev = to_pci_dev(hwdev);
2487 struct dmar_domain *domain;
2491 struct iova *iova = NULL;
2493 struct scatterlist *sg;
2494 unsigned long start_addr;
2495 struct intel_iommu *iommu;
2497 BUG_ON(dir == DMA_NONE);
2498 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2499 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2501 domain = get_valid_domain_for_dev(pdev);
2505 iommu = domain_get_iommu(domain);
2507 for_each_sg(sglist, sg, nelems, i) {
2508 addr = SG_ENT_VIRT_ADDRESS(sg);
2509 addr = (void *)virt_to_phys(addr);
2510 size += aligned_size((u64)addr, sg->length);
2513 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2515 sglist->dma_length = 0;
2520 * Check if DMAR supports zero-length reads on write only
2523 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2524 !cap_zlr(iommu->cap))
2525 prot |= DMA_PTE_READ;
2526 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2527 prot |= DMA_PTE_WRITE;
2529 start_addr = iova->pfn_lo << PAGE_SHIFT;
2531 for_each_sg(sglist, sg, nelems, i) {
2532 addr = SG_ENT_VIRT_ADDRESS(sg);
2533 addr = (void *)virt_to_phys(addr);
2534 size = aligned_size((u64)addr, sg->length);
2535 ret = domain_page_mapping(domain, start_addr + offset,
2536 ((u64)addr) & PAGE_MASK,
2539 /* clear any PTEs already set up for this list */
2540 dma_pte_clear_range(domain, start_addr,
2541 start_addr + offset);
2542 /* free page tables */
2543 dma_pte_free_pagetable(domain, start_addr,
2544 start_addr + offset);
2546 __free_iova(&domain->iovad, iova);
2549 sg->dma_address = start_addr + offset +
2550 ((u64)addr & (~PAGE_MASK));
2551 sg->dma_length = sg->length;
2555 /* it's a non-present to present mapping */
2556 if (iommu_flush_iotlb_psi(iommu, domain->id,
2557 start_addr, offset >> VTD_PAGE_SHIFT, 1))
2558 iommu_flush_write_buffer(iommu);
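/*
 * Editorial example (hypothetical driver code): the path into
 * intel_map_sg()/intel_unmap_sg() above.  The whole scatterlist is backed
 * by one contiguous IOVA allocation, so each entry's sg_dma_address() falls
 * inside a single range that intel_unmap_sg() later tears down in one go.
 * example_fill_descriptor() is a made-up stand-in for programming the
 * device.
 */
#if 0	/* illustrative, not compiled */
static void example_fill_descriptor(int idx, dma_addr_t addr, unsigned int len);

static int example_map_pages(struct pci_dev *pdev, struct page **pages,
			     int npages, struct scatterlist *sgl)
{
	struct scatterlist *sg;
	int i, nents;

	sg_init_table(sgl, npages);
	for (i = 0; i < npages; i++)
		sg_set_page(&sgl[i], pages[i], PAGE_SIZE, 0);

	nents = dma_map_sg(&pdev->dev, sgl, npages, DMA_TO_DEVICE);
	if (!nents)
		return -ENOMEM;

	for_each_sg(sgl, sg, nents, i)
		example_fill_descriptor(i, sg_dma_address(sg), sg_dma_len(sg));

	/* ... once the device has finished with the buffers: */
	dma_unmap_sg(&pdev->dev, sgl, npages, DMA_TO_DEVICE);
	return 0;
}
#endif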
2562 static struct dma_mapping_ops intel_dma_ops = {
2563 .alloc_coherent = intel_alloc_coherent,
2564 .free_coherent = intel_free_coherent,
2565 .map_single = intel_map_single,
2566 .unmap_single = intel_unmap_single,
2567 .map_sg = intel_map_sg,
2568 .unmap_sg = intel_unmap_sg,
2571 static inline int iommu_domain_cache_init(void)
2575 iommu_domain_cache = kmem_cache_create("iommu_domain",
2576 sizeof(struct dmar_domain),
2581 if (!iommu_domain_cache) {
2582 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2589 static inline int iommu_devinfo_cache_init(void)
2593 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2594 sizeof(struct device_domain_info),
2598 if (!iommu_devinfo_cache) {
2599 printk(KERN_ERR "Couldn't create devinfo cache\n");
2606 static inline int iommu_iova_cache_init(void)
2610 iommu_iova_cache = kmem_cache_create("iommu_iova",
2611 sizeof(struct iova),
2615 if (!iommu_iova_cache) {
2616 printk(KERN_ERR "Couldn't create iova cache\n");
2623 static int __init iommu_init_mempool(void)
2626 ret = iommu_iova_cache_init();
2630 ret = iommu_domain_cache_init();
2634 ret = iommu_devinfo_cache_init();
2638 kmem_cache_destroy(iommu_domain_cache);
2640 kmem_cache_destroy(iommu_iova_cache);
2645 static void __init iommu_exit_mempool(void)
2647 kmem_cache_destroy(iommu_devinfo_cache);
2648 kmem_cache_destroy(iommu_domain_cache);
2649 kmem_cache_destroy(iommu_iova_cache);
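/*
 * Editorial sketch: how the three caches above are consumed.  The real
 * helpers (alloc_domain_mem(), alloc_devinfo_mem() and the iova allocator,
 * all earlier in this file) wrap the usual kmem_cache pattern and may add
 * low-memory handling on top; the basic shape is simply:
 */
#if 0	/* illustrative, not compiled */
static struct dmar_domain *example_alloc_domain_mem(void)
{
	/* GFP_ATOMIC: these allocations can happen under spinlocks */
	return kmem_cache_zalloc(iommu_domain_cache, GFP_ATOMIC);
}

static void example_free_domain_mem(struct dmar_domain *domain)
{
	kmem_cache_free(iommu_domain_cache, domain);
}
#endif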
2653 static void __init init_no_remapping_devices(void)
2655 struct dmar_drhd_unit *drhd;
2657 for_each_drhd_unit(drhd) {
2658 if (!drhd->include_all) {
2660 for (i = 0; i < drhd->devices_cnt; i++)
2661 if (drhd->devices[i] != NULL)
2663 /* ignore DMAR unit if no pci devices exist */
2664 if (i == drhd->devices_cnt)
2672 for_each_drhd_unit(drhd) {
2674 if (drhd->ignored || drhd->include_all)
2677 for (i = 0; i < drhd->devices_cnt; i++)
2678 if (drhd->devices[i] &&
2679 !IS_GFX_DEVICE(drhd->devices[i]))
2682 if (i < drhd->devices_cnt)
2685 /* bypass IOMMU if it is just for gfx devices */
2687 for (i = 0; i < drhd->devices_cnt; i++) {
2688 if (!drhd->devices[i])
2690 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
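/*
 * Editorial note: devices excluded here are simply tagged, not tracked.
 * The DMA entry points above (e.g. intel_map_sg()) test the same tag and
 * fall back to the non-translating virt_to_bus() path for such devices, so
 * a graphics-only DMAR unit never has to be programmed.  Illustrative check
 * (not a function in this file):
 */
#if 0	/* illustrative, not compiled */
static int example_device_is_bypassed(struct pci_dev *pdev)
{
	return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}
#endif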
2695 int __init intel_iommu_init(void)
2699 if (dmar_table_init())
2702 if (dmar_dev_scope_init())
2706 * Check the need for DMA-remapping initialization now.
2707 * The DMAR table and device-scope initialization above is also used by interrupt remapping.
2709 if (no_iommu || swiotlb || dmar_disabled)
2712 iommu_init_mempool();
2713 dmar_init_reserved_ranges();
2715 init_no_remapping_devices();
2719 printk(KERN_ERR "IOMMU: dmar init failed\n");
2720 put_iova_domain(&reserved_iova_list);
2721 iommu_exit_mempool();
2725 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2727 init_timer(&unmap_timer);
2729 dma_ops = &intel_dma_ops;
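/*
 * Editorial example: the effect of the dma_ops assignment above.  From this
 * point on, an ordinary driver call such as the one below is routed to
 * intel_map_single() and returns an IOVA from the device's dmar_domain
 * rather than a raw physical bus address (hypothetical caller, not part of
 * this file):
 */
#if 0	/* illustrative, not compiled */
static dma_addr_t example_map_one(struct pci_dev *pdev, void *buf, size_t len)
{
	return dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
}
#endif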
2733 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2734 struct pci_dev *pdev)
2736 struct device_domain_info *info;
2737 unsigned long flags;
2739 info = alloc_devinfo_mem();
2743 info->bus = pdev->bus->number;
2744 info->devfn = pdev->devfn;
2746 info->domain = domain;
2748 spin_lock_irqsave(&device_domain_lock, flags);
2749 list_add(&info->link, &domain->devices);
2750 list_add(&info->global, &device_domain_list);
2751 pdev->dev.archdata.iommu = info;
2752 spin_unlock_irqrestore(&device_domain_lock, flags);
2757 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2758 struct pci_dev *pdev)
2760 struct device_domain_info *info;
2761 struct intel_iommu *iommu;
2762 unsigned long flags;
2764 struct list_head *entry, *tmp;
2766 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2770 spin_lock_irqsave(&device_domain_lock, flags);
2771 list_for_each_safe(entry, tmp, &domain->devices) {
2772 info = list_entry(entry, struct device_domain_info, link);
2773 if (info->bus == pdev->bus->number &&
2774 info->devfn == pdev->devfn) {
2775 list_del(&info->link);
2776 list_del(&info->global);
2778 info->dev->dev.archdata.iommu = NULL;
2779 spin_unlock_irqrestore(&device_domain_lock, flags);
2781 iommu_detach_dev(iommu, info->bus, info->devfn);
2782 free_devinfo_mem(info);
2784 spin_lock_irqsave(&device_domain_lock, flags);
2792 /* if there are no other devices under the same iommu
2793 * owned by this domain, clear this iommu in iommu_bmp,
2794 * update the iommu count and coherency */
2796 if (device_to_iommu(info->bus, info->devfn) == iommu)
2801 unsigned long tmp_flags;
2802 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2803 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2804 domain->iommu_count--;
2805 domain_update_iommu_coherency(domain);
2806 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2809 spin_unlock_irqrestore(&device_domain_lock, flags);
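/*
 * Editorial sketch of the recomputation triggered above.  When the last
 * device behind a given IOMMU leaves the domain, its bit is cleared from
 * iommu_bmp and the coherency flag is re-derived: the domain is only
 * treated as coherent if every IOMMU still attached to it snoops page-table
 * writes; otherwise updated PTEs must be clflushed.  Roughly (the real
 * domain_update_iommu_coherency() earlier in this file differs in detail):
 */
#if 0	/* illustrative, not compiled */
static void example_update_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;

	domain->iommu_coherency = 1;
	for_each_drhd_unit(drhd) {
		struct intel_iommu *iommu = drhd->iommu;

		if (!iommu || !test_bit(iommu->seq_id, &domain->iommu_bmp))
			continue;
		if (!ecap_coherent(iommu->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
}
#endif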
2812 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2814 struct device_domain_info *info;
2815 struct intel_iommu *iommu;
2816 unsigned long flags1, flags2;
2818 spin_lock_irqsave(&device_domain_lock, flags1);
2819 while (!list_empty(&domain->devices)) {
2820 info = list_entry(domain->devices.next,
2821 struct device_domain_info, link);
2822 list_del(&info->link);
2823 list_del(&info->global);
2825 info->dev->dev.archdata.iommu = NULL;
2827 spin_unlock_irqrestore(&device_domain_lock, flags1);
2829 iommu = device_to_iommu(info->bus, info->devfn);
2830 iommu_detach_dev(iommu, info->bus, info->devfn);
2832 /* clear this iommu in iommu_bmp, update the iommu count
 * and coherency */
2835 spin_lock_irqsave(&domain->iommu_lock, flags2);
2836 if (test_and_clear_bit(iommu->seq_id,
2837 &domain->iommu_bmp)) {
2838 domain->iommu_count--;
2839 domain_update_iommu_coherency(domain);
2841 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2843 free_devinfo_mem(info);
2844 spin_lock_irqsave(&device_domain_lock, flags1);
2846 spin_unlock_irqrestore(&device_domain_lock, flags1);
2849 /* domain id for virtual-machine domains; it is never written into a context entry */
2850 static unsigned long vm_domid;
2852 static struct dmar_domain *iommu_alloc_vm_domain(void)
2854 struct dmar_domain *domain;
2856 domain = alloc_domain_mem();
2860 domain->id = vm_domid++;
2861 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2862 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2867 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2871 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2872 spin_lock_init(&domain->mapping_lock);
2873 spin_lock_init(&domain->iommu_lock);
2875 domain_reserve_special_ranges(domain);
2877 /* calculate AGAW */
2878 domain->gaw = guest_width;
2879 adjust_width = guestwidth_to_adjustwidth(guest_width);
2880 domain->agaw = width_to_agaw(adjust_width);
2882 INIT_LIST_HEAD(&domain->devices);
2884 domain->iommu_count = 0;
2885 domain->iommu_coherency = 0;
2887 /* always allocate the top pgd */
2888 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2891 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
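/*
 * Editorial worked example of the AGAW arithmetic above (assuming the usual
 * VT-d stride of 9 address bits per page-table level on top of the 12-bit
 * page offset): guest_width is first rounded up to a value of the form
 * 12 + 9*n and then converted to a level count.  DEFAULT_DOMAIN_ADDRESS_WIDTH
 * = 48 is already such a value and selects a 4-level table; a guest width of
 * 40 would be rounded up to 48.  Simplified rendering of the rounding step:
 */
#if 0	/* illustrative, not compiled */
static int example_adjust_width(int gaw)
{
	int r = (gaw - 12) % 9;

	return r ? gaw + 9 - r : gaw;	/* e.g. 40 -> 48, 48 -> 48 */
}
#endif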
2895 static void iommu_free_vm_domain(struct dmar_domain *domain)
2897 unsigned long flags;
2898 struct dmar_drhd_unit *drhd;
2899 struct intel_iommu *iommu;
2901 unsigned long ndomains;
2903 for_each_drhd_unit(drhd) {
2906 iommu = drhd->iommu;
2908 ndomains = cap_ndoms(iommu->cap);
2909 i = find_first_bit(iommu->domain_ids, ndomains);
2910 for (; i < ndomains; ) {
2911 if (iommu->domains[i] == domain) {
2912 spin_lock_irqsave(&iommu->lock, flags);
2913 clear_bit(i, iommu->domain_ids);
2914 iommu->domains[i] = NULL;
2915 spin_unlock_irqrestore(&iommu->lock, flags);
2918 i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2923 static void vm_domain_exit(struct dmar_domain *domain)
2927 /* Domain 0 is reserved, so don't process it */
2931 vm_domain_remove_all_dev_info(domain);
2933 put_iova_domain(&domain->iovad);
2934 end = DOMAIN_MAX_ADDR(domain->gaw);
2935 end = end & VTD_PAGE_MASK;
2938 dma_pte_clear_range(domain, 0, end);
2940 /* free page tables */
2941 dma_pte_free_pagetable(domain, 0, end);
2943 iommu_free_vm_domain(domain);
2944 free_domain_mem(domain);
2947 struct dmar_domain *intel_iommu_alloc_domain(void)
2949 struct dmar_domain *domain;
2951 domain = iommu_alloc_vm_domain();
2954 "intel_iommu_domain_alloc: domain == NULL\n");
2957 if (vm_domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2959 "intel_iommu_domain_alloc: domain_init() failed\n");
2960 vm_domain_exit(domain);
2966 EXPORT_SYMBOL_GPL(intel_iommu_alloc_domain);
2968 void intel_iommu_free_domain(struct dmar_domain *domain)
2970 vm_domain_exit(domain);
2972 EXPORT_SYMBOL_GPL(intel_iommu_free_domain);
2974 int intel_iommu_attach_device(struct dmar_domain *domain,
2975 struct pci_dev *pdev)
2979 /* normally pdev is not mapped */
2980 if (unlikely(domain_context_mapped(pdev))) {
2981 struct dmar_domain *old_domain;
2983 old_domain = find_domain(pdev);
2985 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
2986 vm_domain_remove_one_dev_info(old_domain, pdev);
2988 domain_remove_dev_info(old_domain);
2992 ret = domain_context_mapping(domain, pdev);
2996 ret = vm_domain_add_dev_info(domain, pdev);
2999 EXPORT_SYMBOL_GPL(intel_iommu_attach_device);
3001 void intel_iommu_detach_device(struct dmar_domain *domain,
3002 struct pci_dev *pdev)
3004 vm_domain_remove_one_dev_info(domain, pdev);
3006 EXPORT_SYMBOL_GPL(intel_iommu_detach_device);
3008 int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
3009 u64 hpa, size_t size, int prot)
3012 ret = domain_page_mapping(domain, iova, hpa, size, prot);
3015 EXPORT_SYMBOL_GPL(intel_iommu_map_address);
3017 void intel_iommu_unmap_address(struct dmar_domain *domain,
3018 dma_addr_t iova, size_t size)
3022 /* The address might not be aligned */
3023 base = iova & VTD_PAGE_MASK;
3024 size = VTD_PAGE_ALIGN(size);
3025 dma_pte_clear_range(domain, base, base + size);
3027 EXPORT_SYMBOL_GPL(intel_iommu_unmap_address);
3029 int intel_iommu_found(void)
3031 return g_num_of_iommus;
3033 EXPORT_SYMBOL_GPL(intel_iommu_found);
3035 u64 intel_iommu_iova_to_phys(struct dmar_domain *domain, u64 iova)
3037 struct dma_pte *pte;
3040 pte = addr_to_dma_pte(domain, iova);
3042 phys = dma_pte_addr(pte);
3046 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_phys);
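/*
 * Editorial example of how a consumer (at the time, KVM device assignment)
 * strings the exports above together: allocate a VM domain, attach the
 * assigned PCI device, establish IOVA-to-host-physical mappings, and tear
 * everything down again.  Hypothetical caller; error values chosen for
 * illustration only.
 */
#if 0	/* illustrative, not compiled */
static int example_assign_device(struct pci_dev *pdev, u64 guest_addr,
				 u64 host_phys, size_t size)
{
	struct dmar_domain *domain;
	int ret;

	domain = intel_iommu_alloc_domain();
	if (!domain)
		return -ENOMEM;

	ret = intel_iommu_attach_device(domain, pdev);
	if (ret)
		goto out_free;

	/* guest-physical addresses are used directly as IOVAs */
	ret = intel_iommu_map_address(domain, guest_addr, host_phys, size,
				      DMA_PTE_READ | DMA_PTE_WRITE);
	if (ret)
		goto out_detach;

	/* the translation can be read back, e.g. for debugging */
	WARN_ON(intel_iommu_iova_to_phys(domain, guest_addr) !=
		(host_phys & VTD_PAGE_MASK));
	return 0;

out_detach:
	intel_iommu_detach_device(domain, pdev);
out_free:
	intel_iommu_free_domain(domain);
	return ret;
}

static void example_deassign_device(struct pci_dev *pdev,
				    struct dmar_domain *domain,
				    u64 guest_addr, size_t size)
{
	intel_iommu_unmap_address(domain, guest_addr, size);
	intel_iommu_detach_device(domain, pdev);
	intel_iommu_free_domain(domain);
}
#endif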